#rmarkdown tips
# https://bookdown.org/yihui/rmarkdown/basics.html


### for code folding, put this under html options above and set echo = T below:
# output: 
#   html_document:
#     code_folding: hide


# make sections with R code
# https://stackoverflow.com/questions/36674824/use-loop-to-generate-section-of-text-in-rmarkdown 

# Global chunk defaults for the whole report.
# `error = FALSE` stops knitting at the first chunk error; setting it to TRUE
# keeps knitting past errors (obviously dangerous, for testing only).
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  results = 'hold',
  fig.width = 7,
  fig.height = 7,
  error = FALSE
)

#set whole project directory (use this instead of setwd() in rmd)
# knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
#hopefully commenting just works with cwd


# start the timer for the whole run
timestart <- proc.time()


# memory limit for globals exported by the future package
# this is for parallel via future (for markers)
# some seurat functions also rely on this under the hood...
# pipeline may fail if this is not set
#
# smaller alternatives, kept for reference:
# options(future.globals.maxSize = 850 * 1024^2)  # 850 MB
# options(future.globals.maxSize = 8000 * 1024^2) # 8 GB
options(future.globals.maxSize = 15000 * 1024^2) # 15 GB


# fixed seed so repeated runs use the same random number stream
set.seed(2022)
## for testing and / or manually setting things only;
## keep commented otherwise



# bindingIsLocked("params", env = .GlobalEnv)
# unlockBinding("params", env = .GlobalEnv)
# 
# projdir = '/Users/alexanderferrena/Dropbox (EinsteinMed)/Kevyn_SDAP_Test/'
# 
# params <- list(
#   datadir = paste0(projdir, 'NewReplicate_GFPpos_GFPneg/datadir/'),
#   outdir = paste0(projdir, 'outs/DEVTEST/' ),
#   
#   use_labeltransfer = F,
#   # refdatapath = paste0(projdir, 'Reference_Seurat_Generation/Fuwai_E105_Wn1Cre_Hearts/CNCCsE105_Dub_namedClusters.rds'),
#   # m_reference = paste0(projdir, 'Reference_Seurat_Generation/Fuwai_E105_Wn1Cre_Hearts/Markers.rds'),
#   
#   sample_metadata = paste0(projdir, '/NewReplicate_GFPpos_GFPneg/sample_metadata.csv'),
#   comps = paste0(projdir, '/NewReplicate_GFPpos_GFPneg/comps.csv'),
#   
#   min_num_UMI = 500,
#   min_num_Feature = 200,
#   max_perc_mito = 25,
#   max_perc_hemoglobin = 25,
#   autofilter_complexity = F,
#   autofilter_mito = FALSE,
#   autofilter_nUMI = F,
#   autofilter_medianabsolutedev_threshold = 3,
#   autofilter_loess_negative_residual_threshold = -5,
#   
#   doubletFinder = T,
#   
#   
#   pcs_indi = 30,
#   res_indi = 0.5,
#   pcs_int = 30,
#   res_int = 1,
#   RISC_louvain_neighbors=10,
#   
#   risc_reference = NULL,
#   crossconditionDE_padj_thres = NULL,
#   crossconditionDE_lfc_thres = NULL,
#   pathway_padj_thres = 0.1,
#   species = 'Mus musculus',
#   workernum = 4,
#   input_seurat_obj = F,
#   force_redo = F
# )











#set verbosity --> deprecated, keep false
verbose <- FALSE


## key params, setting paths

# Input directory. The NULL check must look at params$datadir itself:
# paste0(NULL, '/') returns '/', so testing the pasted value can never detect a
# missing parameter.
if (is.null(params$datadir)) {
  stop('Please set datadir, path to cellranger results')
}
datadir <- paste0(params$datadir, '/')

# whether the inputs are saved Seurat objects; defaults to FALSE when not given
input_seurat_obj <- params$input_seurat_obj
if (is.null(input_seurat_obj)) {
  input_seurat_obj <- FALSE
}


# Output directory; same reasoning as for datadir, check before pasting.
if (is.null(params$outdir)) {
  stop('Please set outdir, output folder')
}
outdir <- paste0(params$outdir, '/')

# sub-folder for individual sample results
outdir_indi <- paste0(outdir, '/individualsample_analysis')

# sub-folder for integration results
outdir_int <- paste0(outdir, '/multisample_integration/')

# Create the folders only when they are missing, so re-running the report on an
# existing output folder does not raise "already exists" warnings.
if (!dir.exists(outdir)) {
  dir.create(outdir, recursive = TRUE)
}
if (!dir.exists(outdir_indi)) {
  dir.create(outdir_indi, recursive = TRUE)
}
if (!dir.exists(outdir_int)) {
  dir.create(outdir_int, recursive = TRUE)
}




# Label transfer configuration: the switch itself is mandatory, and the two
# reference paths are only required when label transfer is switched on.
use_labeltransfer <- params$use_labeltransfer
if (is.null(use_labeltransfer)) {
  stop('Please set use_labeltransfer to T/F; if T, please also provide paths refdatapath and m_reference')
}

if (!(use_labeltransfer == TRUE)) {

  # label transfer is off: keep placeholders so both names always exist
  refdatapath <- NA
  m_reference <- NA
  # SeuratLabelTransfer.normalization.method <- NA

} else {

  # reference Seurat object
  refdatapath <- params$refdatapath
  if (is.null(refdatapath)) {
    stop('Please set refdatapath, path to reference scRNAseq Seurat object .rds file with "Celltype" in meta.data')
  }

  # reference markers
  m_reference <- params$m_reference

  ### just added: SeuratLabelTransfer.normalization.method
  # SeuratLabelTransfer.normalization.method <- params$SeuratLabelTransfer.normalization.method

  if (is.null(m_reference)) {
    stop('Please set m_reference, path to FindAllMarkers result from reference celltypes, saved as .rds file')
  }

}


# which cross-condition DE approach to use; mandatory
Pseudobulk_mode <- params$Pseudobulk_mode
if (is.null(Pseudobulk_mode)) {
  stop('Please set "Pseudobulk_mode" as either TRUE or FALSE')
}


## analysis parameters
# Values are copied from params as given (defaults live in params); only the
# pathway categories are fixed here.

pwaycats <- c("HALLMARK", "GO_BP", "GO_MF", "GO_CC", "CP_REACTOME", "CP_KEGG")

# RISC reference sample (may be NULL)
risc_reference <- params$risc_reference

# hard QC cutoffs
min_num_UMI <- params$min_num_UMI
min_num_Feature <- params$min_num_Feature
max_perc_mito <- params$max_perc_mito
max_perc_hemoglobin <- params$max_perc_hemoglobin

# autofilter switches
autofilter_complexity <- params$autofilter_complexity
autofilter_mito <- params$autofilter_mito
autofilter_nUMI <- params$autofilter_nUMI

# autofilter thresholds
autofilter_medianabsolutedev_threshold <- params$autofilter_medianabsolutedev_threshold
autofilter_loess_negative_residual_threshold <- params$autofilter_loess_negative_residual_threshold

# doublet detection switch
doubletFinder <- params$doubletFinder

# #check package versions and add warnings
# if( (packageVersion('DoubletFinder') == "2.0.3") & (packageVersion('Seurat') >= "5.0.0") ){
#   warning('DoubletFinder v2.0.3 (latest update as of Nov 14 2023) is not compatible with Seurat > v5.0.0.\nTurning off DoubletFinder.\nTo use DoubletFinder, try downgrading to Seurat v4:', 
#           '\n',
#           "https://satijalab.org/seurat/articles/install_v5#install-previous-versions-of-seurat"
#           
#           )
#   
#   doubletFinder <- F
# }


# Warn when DoubletFinder v2.0.3 is installed together with Seurat v5 or later.
# `&&` is the scalar, short-circuiting operator intended for `if` conditions;
# the elementwise `&` is meant for vector masks.
if (packageVersion('DoubletFinder') == "2.0.3" && packageVersion('Seurat') >= "5.0.0") {

  warning('Will attempt to coerce Seurat v5 object to work with DoubletFinder v2.0.3; this is unstable and does not always work! If any errors arise, set doubletFinder to FALSE in pipeline runner')

}



# dimension reduction / clustering settings (individual samples and integration)
pcs_indi <- params$pcs_indi
res_indi <- params$res_indi
pcs_int <- params$pcs_int
res_int <- params$res_int
RISC_louvain_neighbors <- params$RISC_louvain_neighbors


# Cross-condition DE thresholds: NULL means "not chosen by the user", in which
# case lenient values are used for pseudobulk and stricter ones otherwise.
crossconditionDE_padj_thres <- params$crossconditionDE_padj_thres
crossconditionDE_lfc_thres <- params$crossconditionDE_lfc_thres

if (Pseudobulk_mode == TRUE) {
  if (is.null(crossconditionDE_padj_thres)) crossconditionDE_padj_thres <- 0.1
  if (is.null(crossconditionDE_lfc_thres)) crossconditionDE_lfc_thres <- 0
}

if (Pseudobulk_mode == FALSE) {
  if (is.null(crossconditionDE_padj_thres)) crossconditionDE_padj_thres <- 0.05
  if (is.null(crossconditionDE_lfc_thres)) crossconditionDE_lfc_thres <- 0.25
}


# pathway analysis settings
pathway_padj_thres <- params$pathway_padj_thres

# pwaycats: NULL
species <- params$species


## parallelization
workernum <- params$workernum

# redo / overwrite: fixed to TRUE here, the value in params is not read
# force_redo <- params$force_redo
force_redo <- TRUE


# ## save the parameter choices
# paramsave <- lapply(1:length(params), function(i){
#   var <- names(params)[i]
#   
#   get(var)
# })
# 
# names(paramsave) <- names(params)
# 
# 
# paramsave <- t(data.frame(paramsave))
# 
# paramsave <- cbind(rownames(paramsave), paramsave)
# rownames(paramsave) <- NULL




## defining DE comparisons and sample conditions with metadata

#do this below saving params, since we overwrite these names

# Sample metadata: params$sample_metadata is a path to a csv file; after this
# section `sample_metadata` holds the parsed table instead of the path.
sample_metadata <- params$sample_metadata
if (is.null(sample_metadata)) {
  stop('Please set sample_metadata, path to csv file with Sample column and Condition column')
}

sample_metadata <- read.csv(sample_metadata)

# fail early with a clear message instead of an obscure error further down
if (!all(c('Sample', 'Condition') %in% colnames(sample_metadata))) {
  stop('sample_metadata csv file must contain a Sample column and a Condition column')
}

# keep conditions in the order in which they first appear in the file
sample_metadata$Condition <- factor(sample_metadata$Condition,
                                    levels = unique(sample_metadata$Condition))

# Code column is optional; when absent it is built as Condition_Sample
if (!('Code' %in% colnames(sample_metadata))) {
  sample_metadata$Code <- paste0(sample_metadata$Condition, '_', sample_metadata$Sample)
}


# Comparisons: params$comps is a path to a csv file with c1 and c2 columns.
comps <- params$comps

if (!is.null(comps)) {

  comps <- read.csv(comps)

} else {

  # No comparison table given: fall back to comparing the first condition with
  # the second. This needs at least two conditions. The previous test (> 2)
  # rejected the unambiguous two-condition case; with more than two conditions
  # only the first two are compared, as before.
  if (length(levels(sample_metadata$Condition)) >= 2) {

    warning('"comps" data.frame not provided, will try to guess from sample_metadata')
    comps <- data.frame(c1 = levels(sample_metadata$Condition)[1],
                        c2 = levels(sample_metadata$Condition)[2])

  } else {

    stop('Please set comps, path to csv file c1 and c2 column defining comparisons of conditions to use')

  }

}






# Collect the parameters used in this run so they can be shown in the report
# and written to the output folder.
pl <- list(datadir = datadir,
           outdir = outdir,
           sample_metadata = params$sample_metadata,
           comps = params$comps,

           use_labeltransfer = use_labeltransfer,
           refdatapath = refdatapath,
           m_reference = m_reference,
           # SeuratLabelTransfer.normalization.method=SeuratLabelTransfer.normalization.method,

           min_num_UMI = min_num_UMI,
           min_num_Feature = min_num_Feature,
           max_perc_mito = max_perc_mito,
           max_perc_hemoglobin = max_perc_hemoglobin,
           autofilter_mito = autofilter_mito,
           autofilter_nUMI = autofilter_nUMI,
           autofilter_complexity = autofilter_complexity,
           autofilter_medianabsolutedev_threshold = autofilter_medianabsolutedev_threshold,
           autofilter_loess_negative_residual_threshold = autofilter_loess_negative_residual_threshold,
           doubletFinder = doubletFinder,

           # a missing RISC reference is reported as 'auto'
           risc_reference = if (is.null(risc_reference)) 'auto' else risc_reference,

           pcs_indi = pcs_indi,
           res_indi = res_indi,
           pcs_int = pcs_int,
           res_int = res_int,
           RISC_louvain_neighbors = RISC_louvain_neighbors,

           Pseudobulk_mode = Pseudobulk_mode,
           crossconditionDE_padj_thres = crossconditionDE_padj_thres,
           crossconditionDE_lfc_thres = crossconditionDE_lfc_thres,
           pathway_padj_thres = pathway_padj_thres,
           species = species,

           workernum = workernum,
           input_seurat_obj = input_seurat_obj
)

# One row per parameter. Each entry is turned into exactly one string:
# unlist() drops NULL entries (e.g. comps when the comparisons were guessed)
# and expands multi-value entries, either of which makes data.frame() fail
# because the two columns no longer have the same length.
pldf <- data.frame(parameter = names(pl),
                   value = vapply(pl, function(x) {
                     if (is.null(x)) {
                       'NULL'
                     } else {
                       paste(as.character(x), collapse = ';')
                     }
                   }, character(1)))

#write it out
write.csv(pldf, paste0(outdir, '/pipeline_parameters.csv'), quote = FALSE, row.names = FALSE)

1 Introduction

This report summarizes analysis of single-cell RNA-sequencing (scRNA-seq) data including single sample analysis, label transfer from a cell-type annotated reference (optional), integration with batch correction, differential expression with support for pseudobulk and multi-condition comparisons, and pathway analysis. All plots, tables, markers, Seurat objects, RISC object, DE results, and pathway analysis results are stored in the output folder.

The analysis pipelines in this report were developed by the lab of Dr. Deyou Zheng in the Department of Genetics and Department of Neuroscience at Albert Einstein College of Medicine. The pipeline was assembled primarily by PhD candidate Alexander Ferrena with additional input from all lab members.

1.1 Description of methods

Below is a short methods-section style description of the pipeline. More detailed information can be found in each sub-section. If required, software versions of all R packages can be found at the bottom of the document in the SessionInfo section. The version of Cellranger can be found in the web_summary.html files provided with the data release. Please read carefully and adjust accordingly before using for manuscripts or applications.

# Methods-style description printed into the report.
# text1 covers preprocessing and individual sample analysis; text2 covers
# integration, compositional analysis, differential expression and pathway
# analysis, and depends on Pseudobulk_mode. The numbers quoted in the text are
# written into the strings, they are not read from the run parameters.
text1 <- 
  '
Sequencing data from the 10x Chromium samples were first analyzed with Cellranger to generate cell level gene expression data [(10X Genomics)](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger). Cells with fewer than 500 Unique Molecular Identifiers (UMIs) detected, fewer than 200 unique genes detected, more than 25% mitochondrial content, or more than 25% hemoglobin gene expression were filtered out (unless otherwise specified). An automated filtering approach to detect outliers of these variables based on median absolute deviation above 3 was also applied. Furthermore, complexity analysis based on the expected number of genes given number of UMIs per cell was also used to filter out poor quality cells. Samples were analyzed using the Seurat workflow [(Hao et al 2021)](https://doi.org/10.1016/j.cell.2021.04.048). Each sample was normalized using the SingleCellTransform pipeline [(Hafemeister & Satija 2019)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) with method set to "glmGamPoi" [(Ahlmann-Eltze & Huber 2020)](https://academic.oup.com/bioinformatics/article/36/24/5701/6028985). 30 principal components were used for graph construction and resolution was set to 0.5 for Louvain clustering (unless otherwise specified). Marker analysis was performed using the Wilcoxon Rank Sum Test as implemented in the Seurat FindAllMarkers() function with "only.pos" set to True. To identify cell types present in each sample, the Seurat anchor-based Label Transfer method was used, based on a single-cell RNA-seq dataset from similar tissue (unless otherwise specified).


'


# pseudobulk variant (spelling of "proportional" corrected in the emitted text)
if(Pseudobulk_mode == TRUE){
  
  text2 <- 'Samples were integrated using the RISC package [(Liu, Zheng et al Nat Biotech 2021)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8456427/). 30 PCs were used for integration as well as for post-clustering analysis including Louvain clustering, unless otherwise specified. Compositional analysis to compare proportional abundance of clusters between conditions was performed via the Propeller test as implemented in the speckle package [(Phipson et al 2022)](https://academic.oup.com/bioinformatics/article/38/20/4720/6675456). Differential expression analysis across conditions followed a pseudobulking approach based on the EdgeR-Likelihood Ratio Test (LRT) method for each cluster [(Robinson et al 2010)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2796818/). Pathway analysis on the differentially expressed genes was performed using Gene Set Enrichment Analysis (GSEA) as implemented in the FGSEA package [(Subramanian et al 2005)](https://www.pnas.org/doi/10.1073/pnas.0506580102), [(Korotkevich et al 2021)](https://www.biorxiv.org/content/10.1101/060012v3). Pathways were downloaded from the Molecular Signatures Database using the R package msigdbr [(Liberzon et al 2015)](https://academic.oup.com/bioinformatics/article/27/12/1739/257711), [(Dolgalev 2022)](https://github.com/igordot/msigdbr).
'
  
}
# non-pseudobulk variant (same spelling correction)
if(Pseudobulk_mode == FALSE){
  
  text2 <- 'Samples were integrated using the RISC package [(Liu, Zheng et al Nat Biotech 2021)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8456427/). 30 PCs were used for integration as well as for post-clustering analysis including Louvain clustering, unless otherwise specified. Compositional analysis to compare proportional abundance of clusters between conditions was performed via the 2-sample Z test as implemented in the R prop.test() function. Differential expression analysis across conditions used the Wilcoxon test as implemented in the Seurat FindMarkers() function. Pathway analysis on the differentially expressed genes was performed using Gene Set Enrichment Analysis (GSEA) as implemented in the FGSEA package [(Subramanian et al 2005)](https://www.pnas.org/doi/10.1073/pnas.0506580102), [(Korotkevich et al 2021)](https://www.biorxiv.org/content/10.1101/060012v3). Pathways were downloaded from the Molecular Signatures Database using the R package msigdbr [(Liberzon et al 2015)](https://academic.oup.com/bioinformatics/article/27/12/1739/257711), [(Dolgalev 2022)](https://github.com/igordot/msigdbr).
'
  
}


# cat(text1); cat(text2)

# join both parts with a newline and print them into the report
alltext <- paste0(text1, '\n', text2)
cat(alltext)

Sequencing data from the 10x Chromium samples were first analyzed with Cellranger to generate cell level gene expression data (10X Genomics). Cells with fewer than 500 Unique Molecular Identifiers (UMIs) detected, fewer than 200 unique genes detected, more than 25% mitochondrial content, or more than 25% hemoglobin gene expression were filtered out (unless otherwise specified). An automated filtering approach to detect outliers of these variables based on median absolute deviation above 3 was also applied. Furthermore, complexity analysis based on the expected number of genes given number of UMIs per cell was also used to filter out poor quality cells. Samples were analyzed using the Seurat workflow (Hao et al 2021). Each sample was normalized using the SingleCellTransform pipeline (Hafemeister & Satija 2019) with method set to “glmGamPoi” (Ahlmann-Eltze & Huber 2020). 30 principal components were used for graph construction and resolution was set to 0.5 for Louvain clustering (unless otherwise specified). Marker analysis was performed using the Wilcoxon Rank Sum Test as implemented in the Seurat FindAllMarkers() function with “only.pos” set to True. To identify cell types present in each sample, the Seurat anchor-based Label Transfer method was used, based on a single-cell RNA-seq dataset from similar tissue (unless otherwise specified).

Samples were integrated using the RISC package (Liu, Zheng et al Nat Biotech 2021). 30 PCs were used for integration as well as for post-clustering analysis including Louvain clustering, unless otherwise specified. Compositional analysis to compare proportional abundance of clusters between conditions was performed via the Propeller test as implemented in the speckle package (Phipson et al 2022). Differential expression analysis across conditions followed a pseudobulking approach based on the EdgeR-Likelihood Ratio Test (LRT) method for each cluster (Robinson et al 2010). Pathway analysis on the differentially expressed genes was performed using Gene Set Enrichment Analysis (GSEA) as implemented in the FGSEA package (Subramanian et al 2005), (Korotkevich et al 2021). Pathways were downloaded from the Molecular Signatures Database using the R package msigdbr (Liberzon et al 2015), (Dolgalev 2022).

1.2 Pipeline Parameters

# Markdown heading and intro sentence for the sample table; the blank lines
# inside the literal are part of the emitted text. Printed with cat().
plotlab <- '

### Sample names and conditions

Here we list the sample names, conditions, and code names for each sample.


'

cat(plotlab)

1.2.1 Sample names and conditions

Here we list the sample names, conditions, and code names for each sample.

# show the parsed sample metadata table (Sample, Condition, Code) in the report
knitr::kable(sample_metadata)
Sample Condition Code
H00049 Healthy Healthy_1
H00053 Healthy Healthy_2
S00006 Covid_Mild Covid_Mild_1
S00016 Covid_Mild Covid_Mild_2
S00040 Covid_Critical Covid_Critical_1
S00043 Covid_Critical Covid_Critical_2
# Markdown heading and explanation for the comparison table; the blank lines
# inside the literal are part of the emitted text. Printed with cat().
plotlab <- '

### Comparisons to perform

We set up a cross-condition comparison using each row of the table below. For example, for each row, the "c1" column is compared with the "c2" column in differential expression analysis.



'

cat(plotlab)

1.2.2 Comparisons to perform

We set up a cross-condition comparison using each row of the table below. For example, for each row, the “c1” column is compared with the “c2” column in differential expression analysis.

# show the comparison table (one row per c1 vs c2 comparison) in the report
knitr::kable(comps)
c1 c2
Covid_Critical Healthy
Covid_Mild Healthy
Covid_Critical Covid_Mild
# Markdown heading and intro sentence for the parameter table; the blank lines
# inside the literal are part of the emitted text. Printed with cat().
plotlab <- '

### Analysis parameters

Here we list the key analysis parameters used in this analysis.



'

cat(plotlab)

1.2.3 Analysis parameters

Here we list the key analysis parameters used in this analysis.

# reset the row names of the parameter table before printing it
# (presumably so that only the parameter and value columns are shown)
rownames(pldf) <- NULL
knitr::kable(pldf)
parameter value
datadir /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//datadir//
outdir /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//outs/PreRunOut//
sample_metadata /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//sample_metadata.csv
comps /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//comps.csv
use_labeltransfer TRUE
refdatapath /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//labeltransferref/LabelTransferRef_SCTnormalized.rds
m_reference /gs/gsfs0/users/aferrena/data/deyou/scDAPP/data//labeltransferref/LabelTransferRefMarkers.rds
min_num_UMI 500
min_num_Feature 200
max_perc_mito 25
max_perc_hemoglobin 25
autofilter_mito TRUE
autofilter_nUMI TRUE
autofilter_complexity TRUE
autofilter_medianabsolutedev_threshold 3
autofilter_loess_negative_residual_threshold -5
doubletFinder TRUE
risc_reference auto
pcs_indi 30
res_indi 0.5
pcs_int 30
res_int 0.5
RISC_louvain_neighbors 10
Pseudobulk_mode TRUE
crossconditionDE_padj_thres 0.1
crossconditionDE_lfc_thres 0
pathway_padj_thres 0.1
species Homo sapiens
workernum 6
input_seurat_obj TRUE

2 Individual sample processing and analysis

First, we preprocess and analyze each individual sample. This will go through the following steps for each sample:

  • Preprocessing: Normalize and pre-process with the Seurat SingleCellTransform (SCT) pipeline
  • Dimension reduction and clustering: Principal component analysis, graph construction, clustering, and visualization via UMAP to discover clusters in each sample
  • Marker analysis: using the Wilcoxon test for differential expression, find unique marker genes of each cluster
  • Label transfer: from a reference single-cell dataset, quantify how much each cell in the data resembles a known cell type

After the analysis of individual samples, we will do an integrated analysis.

### load packages, set seed

library(tidyverse)
library(patchwork)  # combine plots
library(RISC)
library(Seurat)
library(scDAPP)
library(DoubletFinder)

library(future)
library(parallel)
library(foreach)

library(glmGamPoi)  # for faster SCT
library(ComplexHeatmap) # for heatmaps
library(ggdendro)       #for clustering dendrograms

library(ggridges) # qc ridgeplots

library(edgeR)

library(msigdbr)          #get pathways (cross species) from msigdb

library(hdf5r) # HARD TO INSTALL: installed thru mamba
library(ggalluvial) # part of alluvial plot
library(ggfittext) # part of alluvial plot
library(ggrepel) # part of alluvial plot


set.seed(2022)

3 Data Quality Assessment

Once sequencing is completed, we get a file that has the barcode and cDNA sequences as reads (a bunch of ATGCs), along with some sequencing quality information. This file is called a “FASTQ” and has the file extension “.fastq”. The downstream analysis relies on analyzing counts of transcripts for each cell. This means we have to do a few things:

  • Alignment: assign each sequence read to the gene it came from.

  • Gene demultiplexing: Once the read-pair is aligned to a gene, we count the UMI barcode as a count for that gene. Two reads may come from the same gene, but if they have the same UMI, it means they came from the same transcript, so they only count as one UMI. This allows us to get around the problem of short-read sequencing inherent to Illumina technology, to count actual transcripts rather than counting reads.

  • Cell demultiplexing: During 10X library prep, each cell is isolated and given a barcode. Every read from that cell is labelled with the cell barcode. Using the barcode, we assign the read to a specific cell.

These preprocessing steps are performed by the 10x Genomics software called Cellranger. Cellranger produces some outputs including web_summary.html files that allow basic exploration of the sample quality. It is good practice to review these files for each sample.

Downstream analysis, including clustering and label transfer, works with the output of Cellranger Count. This includes a gene by cell matrix. Each row is a gene, each column is a cell and each value is the number of UMIs for each gene from each cell; typically, this is ~20K genes (rows) x ~8K cells (columns). Once that is all complete, we are ready for the analysis performed in this document. We use the Seurat pipeline in R to analyze this data, which involves the steps detailed in section 1.

Flawed datapoints can include poor quality cells and multiplets. Poor quality cells can include cells which have sustained membrane damage and leakage of their RNA, a process which typically results in the observed transcriptomic profile of that cell displaying a high proportion of mitochondrial RNA, or low numbers of UMIs or unique genes. Multiplets can be caused by random error of Poisson loading of single cells in the 10X microfluidics chamber. These can influence and bias downstream analysis. Occasionally, we also observe red blood cells despite usage of RBC lysis buffer. These should not be in the data and so we remove them as well.

#set up output dir for individual samples
# (only created when missing, so an existing folder does not raise a warning)
outdir_indi <- paste0(outdir, '/individualsample_analysis')

if (!dir.exists(outdir_indi)) {
  dir.create(outdir_indi, recursive = TRUE)
}

### read in data ###

# Sample IDs in metadata order, named by their Code, so the resulting list of
# Seurat objects is named by Code as well.
samples <- sample_metadata$Sample
names(samples) <- sample_metadata$Code


# Read each sample either from a saved Seurat object (<datadir>/<Sample>.rds,
# e.g. for hashed data) or from a filtered_feature_bc_matrix.h5 file found
# anywhere below <datadir>/<Sample>. Both input types get the same annotation
# afterwards: project name and orig.ident are set to the sample Code.
sobjlist <- lapply(samples, function(samp) {
  message('\nReading in ', samp)

  if (input_seurat_obj == TRUE) {

    #for HTO, we will assume saved objects
    sobjfile <- paste0(datadir, '/', samp, '.rds')
    if (!file.exists(sobjfile)) {
      stop('Could not find Seurat object file for sample ', samp, ': ', sobjfile)
    }
    sobj <- readRDS(sobjfile)

  } else {

    datafp <- paste0(datadir, '/', samp)
    # if on hpc, use below
    # datafp <- paste0(datadir, '/', samp, '/outs/')

    # for the dl data, we need to find the filepath;
    # fixed = TRUE matches the file name literally (the "." is not a wildcard)
    h5_filename <- grep(pattern = 'filtered_feature_bc_matrix.h5',
                        list.files(datafp, recursive = TRUE, full.names = TRUE),
                        value = TRUE,
                        fixed = TRUE)

    # exactly one file has to match; zero or several matches would otherwise
    # fail inside the reader with a much less helpful message
    if (length(h5_filename) != 1) {
      stop('Expected exactly one filtered_feature_bc_matrix.h5 file for sample ', samp,
           ' under ', datafp, ', found ', length(h5_filename))
    }

    #read in
    sobj <- CreateSeuratObject(Read10X_h5(h5_filename),
                               min.cells = 3)

  }

  # metadata row(s) of this sample
  md_samp <- sample_metadata[sample_metadata$Sample == samp, , drop = FALSE]

  #make project name the code
  sobj@project.name <- md_samp$Code

  #make orig.ident the code
  sobj$orig.ident <- md_samp$Code

  #return seurat obj
  sobj
})
# Add per-cell QC metadata to every sample: mitochondrial percentage,
# hemoglobin percentage and, when it works, cell cycle scores.
sobjlist <- lapply(sobjlist, function(seu) {

  # mitochondrial genes: names starting with "mt-", upper or lower case
  mito_genes <- grep("^mt-", rownames(seu), ignore.case = TRUE, value = TRUE)
  seu[["percent.mito"]] <- Seurat::PercentageFeatureSet(seu, features = mito_genes)

  # hemoglobin content
  seu$percent.hemoglobin <- scDAPP::calculate_percent.hemoglobin(seu)

  # cell cycle scoring sometimes fails; in that case the error is reported by
  # try() and the object is kept as it is
  scored <- try(
    CellCycleScoring(seu,
                     s.features = Seurat::cc.genes.updated.2019$s.genes,
                     g2m.features = Seurat::cc.genes.updated.2019$g2m.genes)
  )
  if (!inherits(scored, 'try-error')) {
    seu <- scored
  }

  seu
})



# Output folders for the filtering step: unfiltered objects, processed objects,
# QC results, and a temporary folder inside the QC folder.
rawobjsdir <- paste0(outdir_indi, '/unfiltered_Seurat_objects')
outdir_indi_seuratobjs <- paste0(outdir_indi, '/processed_Seurat_objects')
qcdir <- paste0(outdir_indi, '/qualitycontrol_filtering')
qctmpdir <- paste0(qcdir, '/qctmpdir/')

invisible(lapply(
  c(rawobjsdir, outdir_indi_seuratobjs, qcdir, qctmpdir),
  dir.create,
  recursive = TRUE
))


# Write every sample to the temporary folder as <Code>.rds so the samples can
# be processed one at a time or in parallel from disk.
invisible(lapply(sobjlist, function(seu) {
  saveRDS(seu, paste0(qctmpdir, '/', seu@project.name, '.rds'))
}))


# the in-memory objects are no longer needed: remove them and free the memory
rm(sobjlist)

invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))

#actual processing steps

#0. read in each sample from temp
#1. normalize and cluster raw data w/o filter
#2. apply autofilter
#3. filter out initial auto filter
#4. IF DF == T: renormalize, recluster, apply doubletfinder, refilter
#5. renormalize, recluster with real clustering parameters
#6. add all filter out annotation to raw sobj
#7. do some analysis on raw and save it
#8. save files (raw and filtered/processed)
#9. return autofilter results

# Per-sample QC and filtering, one foreach task per sample code.
# Each task reads the sample from the temp dir, clusters the unfiltered data,
# applies scDAPP::autofilter (and optionally DoubletFinder), writes the
# unfiltered object + a QC PDF, overwrites the temp .rds with the filtered
# object, and returns list(autofilter result, unfiltered metadata).
cl <- parallel::makeCluster(workernum)
doParallel::registerDoParallel(cl)


codes <- sample_metadata$Code
# codes <- codes[11:17] troubleshoot particular samples
# NOTE(review): the foreach below iterates over sample_metadata$Code, not
# `codes`, so subsetting `codes` here has no effect on which samples run.



# tryCatch(finally = ) guarantees the worker processes are shut down even if a
# task fails; otherwise an error would leave the cluster running.
# af_md_list <- lapply(codes, function(code){
af_md_list <- tryCatch(
  foreach(code = sample_metadata$Code,
          .packages = c('Seurat', 'ggplot2', 'dplyr',
                        'scDAPP','grid', 'irlba', 'Matrix'), .verbose = TRUE) %dopar%
    {
      
      
      message(code)
      
      #0. read in sobj from the temp dir
      sobj <- readRDS(paste0(qctmpdir, '/', code, '.rds'))
      
      
      #1. normalize and cluster raw data w/o filter
      
      suppressWarnings(sobj <- Seurat::SCTransform(sobj, verbose = TRUE, method="glmGamPoi"))
      
      sobj <- Seurat::RunPCA(object = sobj, verbose = FALSE)
      
      sobj <- Seurat::FindNeighbors(object = sobj, dims = 1:30, verbose = FALSE)
      sobj <- Seurat::FindClusters(object = sobj, resolution = 0.1, verbose = FALSE, algorithm = 1)
      
      sobj <- Seurat::RunUMAP(sobj, dims = 1:30)
      
      
      #2. apply autofilter 
      af <- scDAPP::autofilter(sobj, 
                               min_num_UMI = min_num_UMI,
                               min_num_Feature = min_num_Feature,
                               max_perc_mito = max_perc_mito,
                               max_perc_hemoglobin = max_perc_hemoglobin,
                               globalfilter.complexity = autofilter_complexity,
                               globalfilter.mito = autofilter_mito,
                               globalfilter.libsize = autofilter_nUMI,
                               mad.score.threshold = autofilter_medianabsolutedev_threshold,
                               loess_negative_residual_threshold = autofilter_loess_negative_residual_threshold
      )
      
      
      #3. filter out initial auto filter 
      
      #name unfiltered object as sobjraw, and filtered as sobj for now and sobjsave after
      
      sobjraw <- sobj
      
      
      cellstatus <- af$cellstatus
      
      goodcells <- cellstatus[cellstatus$filteredout == FALSE, "barcodes"]
      
      sobj <- sobj[,goodcells]
      
      
      #4. IF DF == T: renormalize, recluster, apply doubletfinder, refilter
      
      if(doubletFinder == TRUE){
        #filter, re-proc
        
        
        ### temporary fix for doubletfinder v2.0.3 and Seurat v5: coerce seurat to "v3" instead of v5 object
        # scalar condition: && (short-circuiting) rather than elementwise &
        if( (packageVersion('DoubletFinder') == "2.0.3") && (packageVersion('Seurat') >= "5.0.0") ){
          
          ## try to make it a old seurat object...
          sobj_df <- GetAssayData(sobj, assay = 'SCT', layer = 'data')
          sobj_df <- CreateAssayObject(sobj_df)
          sobj_df <- CreateSeuratObject(sobj_df)
          
          warning('Will attempt to coerce v5 Seurat object to work with DoubletFinder v2.0.3; this is unstable and does not always work! If any errors arise, set doubletFinder to FALSE in pipeline runner')
          
        } else{sobj_df <- sobj}
        
        
        #normalize and cluster
        suppressWarnings(sobj_df <- Seurat::SCTransform(sobj_df, verbose = TRUE, method="glmGamPoi"))
        
        sobj_df <- Seurat::RunPCA(object = sobj_df, verbose = FALSE)
        
        sobj_df <- Seurat::FindNeighbors(object = sobj_df, dims = 1:30, verbose = FALSE)
        sobj_df <- Seurat::FindClusters(object = sobj_df, resolution = 0.1, verbose = FALSE, algorithm = 1)
        
        sobj_df <- Seurat::RunUMAP(sobj_df, dims = 1:30)
        
        
        # best effort: if DoubletFinder fails, `af` and `sobj` keep their
        # autofilter-only state and the error is reported by try()
        try(expr = {
          ### run DF
          af <- scDAPP::doubletfinderwrapper(sobj_df, 
                                             autofilterres = af, 
                                             num.cores = 1)
          
          #filter, re-proc
          cellstatus <- af$cellstatus
          
          goodcells <- cellstatus[cellstatus$filteredout == FALSE, "barcodes"]
          
          sobj <- sobj[,goodcells]
        })
        
        rm(sobj_df); gc(full = TRUE)
        
      }
      
      
      #previously we did main analysis here, for ease of code reading
      # we'll do it later in the clustering section
      
      # MAKE SURE RAW PREFILT CLUSTERS ARE LABELLED APPROPRIATELY IN SOBJSAVE 
      
      colnames(sobj@meta.data)[grepl('SCT_snn_res.0.1', colnames(sobj@meta.data))] <- 'PREFILTER_SCT_snn_res.0.1'
      
      #name filtered object as sobjsave
      sobjsave <- sobj ; rm(sobj)
      
      
      #7. do some analysis on raw and save it
      
      #add af cell status to sobjraw md (first cellstatus column is dropped)
      # NOTE(review): cbind() pairs rows by position; this assumes
      # af$cellstatus is in the same cell order as sobjraw - confirm.
      sobjraw@meta.data <- cbind(sobjraw@meta.data, af$cellstatus[,-1])
      
      #find markers; do not futurize, it breaks everything
      
      m <- FindAllMarkers(sobjraw, only.pos = TRUE)
      # as of Nov 9 2023 (Seurat v5): add score to markers
      m$score <- (m$pct.1 - m$pct.2) * m$avg_log2FC
      
      
      #prep genes: top n markers per cluster by score
      n <- 5
      top <- m %>% group_by(cluster) %>% top_n(n = n, wt = score)
      
      #make some plots
      d_rawclust <- DimPlot(sobjraw, group.by = 'seurat_clusters', label = TRUE, repel = TRUE)+ggtitle('Unfiltered data clusters', subtitle = 'Louvain res = 0.1')
      
      d_raw_filt <- DimPlot(sobjraw, group.by = 'filteredout', label = FALSE, repel = TRUE)
      
      
      # order filter reasons from most to least frequent; this ordering is
      # reused by the per-cluster table and the alluvial plots below
      sobjraw$filterreason <- factor(sobjraw$filterreason, levels = names(sort(table(sobjraw$filterreason), decreasing = TRUE)))
      d_raw_filt_reason <- DimPlot(sobjraw, group.by = 'filterreason', label = FALSE, repel = TRUE)
      
      fp_raw_qc <- FeaturePlot(sobjraw, c('nCount_RNA', 'nFeature_RNA', 
                                          'percent.mito', 'percent.hemoglobin'),
                               order = TRUE)
      
      #prep per-cluster filter numbers (clusters as rows, reasons as columns)
      tab_filt_by_clust <- table(sobjraw$filterreason, sobjraw$seurat_clusters)
      tab_filt_by_clust <- t(tab_filt_by_clust)
      rownames(tab_filt_by_clust) <- paste0('cluster_', rownames(tab_filt_by_clust))
      colnames(tab_filt_by_clust) <- gsub(x = colnames(tab_filt_by_clust),
                                          pattern = '\\.', '\n')
      
      
      hm_raw <- DoHeatmap(sobjraw, top$gene, raster = FALSE)+NoLegend() + labs(title = "Pre-filter cluster markers")
      
      # d_filt_clust <- DimPlot(sobjraw, group.by = newclustname, label = T, repel = T)+ggtitle('Filtered data clusters')
      
      
      #alluvial plot: one colour per raw cluster
      pal <- grDevices::colorRampPalette(RColorBrewer::brewer.pal('Dark2', n = 8))(length(levels(sobjraw$seurat_clusters))) 
      
      
      ap_filt <- alluvialplot(sobjraw@meta.data[,c('seurat_clusters', 'filteredout')])+
        scale_fill_manual(values = pal)+
        labs(title = 'Cluster filtering')
      
      ap_filt_reason <- alluvialplot(sobjraw@meta.data[,c('seurat_clusters', 'filterreason')])+
        scale_fill_manual(values = pal)+
        labs(title = 'Cluster filtering reason')
      
      
      #add some basic filter vln plots; cutoffs are looked up by command name
      comm <- af$allcommands
      rownames(comm) <- comm$Command
      
      af$vln_umi <- VlnPlot(sobjraw, 'nCount_RNA', group.by = 'orig.ident')+
        scale_y_log10(labels = scales::label_comma())+
        geom_hline(yintercept = comm['min_num_UMI', 2], 
                   linetype = 'dotted')+
        labs(caption = paste0("cutoff = ", comm['min_num_UMI', 2]))
      
      af$vln_feature <- VlnPlot(sobjraw, 'nFeature_RNA', group.by = 'orig.ident')+
        scale_y_log10(labels = scales::label_comma())+
        geom_hline(yintercept = comm['min_num_Feature', 2], 
                   linetype = 'dotted')+
        labs(caption = paste0("cutoff = ", comm['min_num_Feature', 2]))
      
      
      af$vln_mito <- VlnPlot(sobjraw, 'percent.mito', group.by = 'orig.ident')+
        geom_hline(yintercept = comm['max_perc_mito', 2], 
                   linetype = 'dotted')+
        labs(caption = paste0("cutoff = ", comm['max_perc_mito', 2]))
      
      af$vln_hemo  <- VlnPlot(sobjraw, 'percent.hemoglobin', group.by = 'orig.ident')+
        geom_hline(yintercept = comm['max_perc_hemoglobin', 2], 
                   linetype = 'dotted')+
        labs(caption = paste0("cutoff = ", comm['max_perc_hemoglobin', 2]))
      
      
      #add to autofilter
      af$d_rawclust <- d_rawclust
      af$d_raw_filt <- d_raw_filt
      af$d_raw_filt_reason <- d_raw_filt_reason
      
      af$fp_raw_qc <- fp_raw_qc
      
      af$tab_filt_by_clust <- tab_filt_by_clust
      
      af$hm_raw <- hm_raw
      #af$d_filt_clust <- d_filt_clust
      
      
      af$ap_filt <- ap_filt
      af$ap_filt_reason <- ap_filt_reason
      
      
      #change colnames for baseline sumary
      colnames(af$baseline_qc_summary) <- gsub("summary_", "summary\n", colnames(af$baseline_qc_summary))
      
      
      #save it all
      
      # save the raw objects
      saveRDS(sobjraw, paste0(rawobjsdir, '/Unfiltered-SeuratObject-', code, '.rds'))
      
      #save the autofilter as a nice pdf
      afpdf <- paste0(qcdir, '/QC_autofilter_summary-', code,'.pdf')
      
      pdf(afpdf, height = 10, width = 10)
      
      # finally = dev.off() closes the PDF device even if one page fails to draw
      tryCatch({
        
        pdftable(af$filtersummary, title = 'Cell Filtering Summary')
        
        pdftable(af$allcommands, title = 'Filter parameters')
        
        pdftable(round(af$baseline_qc_summary, 2), title = 'QC summary stats')
        
        
        print(af$vln_umi)
        print(af$vln_feature)
        print(af$vln_mito)
        print(af$vln_hemo)
        
        print(af$globalfilter.complexity)
        print(af$globalfilter.libsize)
        print(af$globalfilter.mito)
        
        
        print(af$d_rawclust)
        print(af$d_raw_filt)
        print(af$d_raw_filt_reason)
        print(af$fp_raw_qc)
        
        
        pdftable(af$tab_filt_by_clust, title = 'Cell filtering per cluster')
        
        print(af$hm_raw)
        
        # print(af$d_filt_clust)
        
        print(af$ap_filt)
        print(af$ap_filt_reason)
        
      }, finally = dev.off())
      
      
      #8. save procesed object (overwrites the unfiltered temp file for this code)
      #code <- sobj@project.name
      sobjfile <- paste0(qctmpdir, '/', code, '.rds')
      
      saveRDS(sobjsave, sobjfile)
      
      
      #9. return autofilter and raw md
      rawmd <- sobjraw@meta.data
      rm(sobjsave, sobjraw)
      invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))
      
      
      list(af, rawmd)
      
      
    }, # for foreach
  # }) # for lapply
  finally = parallel::stopCluster(cl)
)


#recover the autofilter and metadata lists
# Each foreach result is a two-element list: [[1]] the autofilter result,
# [[2]] the unfiltered metadata. Split them into two lists keyed by sample code.
aflist <- lapply(af_md_list, `[[`, 1)
mdlist <- lapply(af_md_list, `[[`, 2)

names(aflist) <- sample_metadata$Code
names(mdlist) <- sample_metadata$Code

# the paired list is no longer needed; release it
rm(af_md_list)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))
### prepare some summary plots for filtering


# #from autofilter list, get baseline summaries and cell filter stats
# Build one samples-by-statistic table per QC variable.
# `cn` holds the column names of the first sample's baseline_qc_summary; for
# each of them the matching column is pulled from every sample, the columns are
# bound side by side (one per sample) and transposed so samples become rows.
cn <- colnames(aflist[[1]]$baseline_qc_summary)

bsl <- lapply(cn, function(var){
  
  per_sample <- lapply(seq_along(aflist), function(i){
    
    # keep the single column as a one-column table, named after the sample
    onecol <- aflist[[i]]$baseline_qc_summary[, var, drop = FALSE]
    colnames(onecol) <- names(aflist)[i]
    
    onecol
    
  })
  
  t(dplyr::bind_cols(per_sample))
  
})

# strip the "summary\n" prefix so the tables can be addressed by variable name
cn <- gsub(x=cn, 'summary\n', '')
names(bsl) <- cn



#from outlier list, filter summary
# Build the samples-by-filter-category table of cell counts (`fs`).
# Per sample: take filtersummary, use its first column as row names and keep
# the second column as a one-column table named after the sample.
fs <- lapply(seq_along(aflist), function(i){
  filt <- aflist[[i]]$filtersummary
  rownames(filt) <- filt[,1]
  filt <- filt[, 2, drop = FALSE]
  colnames(filt) <- names(aflist)[i]
  filt
})

#sometimes, some samples don't have any removed for some category ,so need to fix
remnames <- unique(unlist(lapply(fs, rownames)))

fs <- lapply(fs, function(filt){
  
  #identify missing filters and set to 0
  missingfilts <- setdiff(remnames, rownames(filt))
  if (length(missingfilts) > 0) {
    adddf <- data.frame(samp = rep(0, length(missingfilts)))
    colnames(adddf) <- colnames(filt)
    rownames(adddf) <- missingfilts
    filt <- rbind(filt, adddf)
  }
  
  #make sure all have same order
  filt[match(remnames, rownames(filt)), , drop = FALSE]
  
})


#bind table: one column per sample, then transpose so samples are rows
fs <- dplyr::bind_cols(fs)
fs <- t(fs)

#shorten names
colnames(fs) <- gsub('globalfilter', 'auto', colnames(fs))
colnames(fs) <- gsub('DoubletFinder_doublet', 'Doublet', colnames(fs))

#rearrange: total cells, unfilt, then filtered cells
# Select the leading columns by name. Their position is not stable: a category
# missing from the first sample is appended after them in `remnames`, so
# "last two columns" is not reliably Total/Unfiltered. drop = FALSE keeps the
# table two-dimensional when there is only one sample.
lead_cols <- c('Total', 'Unfiltered')
if (all(lead_cols %in% colnames(fs))) {
  fs <- fs[, c(lead_cols, setdiff(colnames(fs), lead_cols)), drop = FALSE]
} else {
  # fallback: original positional rule (last column, second-to-last, the rest)
  nfilt <- ncol(fs)
  fs <- fs[, c(nfilt, nfilt - 1, seq_len(max(nfilt - 2, 0))), drop = FALSE]
}


## also prep some summary plots for each sample ##

### pre-filt plots

# Ridge-line (binned) distribution plots of the QC variables, per sample,
# before and after filtering.

# Build one ridge plot of `qc_var` (a column name of `md`) by sample.
#   md        combined per-cell metadata (needs `qc_var` and orig.ident)
#   qc_var    name of the metadata column to plot; also used as axis title
#   x_limits  limits of the log10 x axis
#   x_offset  constant added to the values before plotting (used so that
#             percentages of 0 survive the log scale)
# The column name is bound inside this function's environment. ggplot2
# evaluates aes() when the plot is drawn, not when it is created, so an
# aesthetic written against a global `var` that is reassigned later would pick
# up the later value at print time.
qc_ridgeplot <- function(md, qc_var, x_limits, x_offset = 0){
  force(qc_var)
  force(x_offset)
  ggplot(md, aes(x = .data[[qc_var]] + x_offset, y = orig.ident, fill = after_stat(log10(x)))) +
    geom_density_ridges_gradient(scale = 0.9, stat = "binline", bins = 50) +
    scale_x_log10(labels = scales::label_comma(), name = qc_var, limits = x_limits) +
    scale_y_discrete(limits = rev(sample_metadata$Code)) +
    viridis::scale_fill_viridis(option = "C", alpha = 0.7) +
    theme_ridges() + theme(legend.position = 'none')
}


### pre-filt plots

combmd <- dplyr::bind_rows(mdlist)

# shared x-axis limits so pre- and post-filter plots are directly comparable
umilims <- c(min(combmd$nCount_RNA), max(combmd$nCount_RNA)) 
featlims <- c(min(combmd$nFeature_RNA), max(combmd$nFeature_RNA))

# Peak density per sample for each QC variable (counts on the log10 scale).
# Not used by the ridge plots; `var`, `submd` and `maxdens` are kept because
# later cleanup code removes them by name.
for (var in c("nCount_RNA", "nFeature_RNA", "percent.mito", "percent.hemoglobin")) {
  submd <- combmd[, c(var, 'orig.ident')]
  colnames(submd)[1] <- 'var'
  if (var %in% c("nCount_RNA", "nFeature_RNA")) {
    submd$var <- log10(submd$var)
  }
  maxdens <- aggregate(var ~ orig.ident, submd, function(x){max(density(x)$y)})
}

# NOTE(review): the percentage plots show value + 1 on the x axis while the
# axis title is the plain variable name.
dens_UMI     <- qc_ridgeplot(combmd, "nCount_RNA", umilims)
dens_feature <- qc_ridgeplot(combmd, "nFeature_RNA", featlims)
dens_mito    <- qc_ridgeplot(combmd, "percent.mito", c(1,100), x_offset = 1)
dens_hemo    <- qc_ridgeplot(combmd, "percent.hemoglobin", c(1,100), x_offset = 1)


### post-filter plots: same plots, restricted to cells that were kept

combmd <- dplyr::bind_rows(mdlist)

combmd <- combmd[combmd$filteredout == FALSE,]

post_dens_UMI     <- qc_ridgeplot(combmd, "nCount_RNA", umilims)
post_dens_feature <- qc_ridgeplot(combmd, "nFeature_RNA", featlims)
post_dens_mito    <- qc_ridgeplot(combmd, "percent.mito", c(1,100), x_offset = 1)
post_dens_hemo    <- qc_ridgeplot(combmd, "percent.hemoglobin", c(1,100), x_offset = 1)







### prepare a whole dataset qc summary pdf

# adjust filter summary table colnames, too long
# Write the whole-dataset QC summary PDF: the filter-count table, then for each
# QC variable its pre-filter summary table and pre/post-filter ridge plots.

# adjust filter summary table colnames, too long
# fixed = TRUE: strip the literal prefix "auto." (as a regex, '.' would match
# any character)
colnames(fs) <- gsub('auto.', '', colnames(fs), fixed = TRUE)

allsampfiltsumm <- paste0(qcdir, '/AllSamples_QC_Summary.pdf')


#add plot titles

dens_UMI <-  dens_UMI + ggtitle('Pre-Filter UMI distribution plot')   
post_dens_UMI <-  post_dens_UMI + ggtitle('Post-Filter UMI distribution plot')   

dens_feature <-  dens_feature + ggtitle('Pre-Filter Unique Gene distribution plot')   
post_dens_feature <- post_dens_feature + ggtitle('Post-Filter Unique Gene distribution plot')   

dens_mito <-  dens_mito + ggtitle('Pre-Filter Percent Mito distribution plot')   
post_dens_mito <-  post_dens_mito + ggtitle('Post-Filter Percent Mito distribution plot')

dens_hemo <-   dens_hemo + ggtitle('Pre-Filter Percent Hemoglobin distribution plot')   
post_dens_hemo <- post_dens_hemo + ggtitle('Post-Filter Percent Hemoglobin distribution plot')   


pdf(allsampfiltsumm, height = 7, width = 7)

# finally = dev.off() closes the PDF device even if a page fails to draw;
# invisible() keeps the device number out of the report output
invisible(tryCatch({
  
  print(pdftable(fs, title = "Cell Filter Summary (cell numbers)"))
  
  
  print( pdftable( round(bsl$nCount_RNA,1) , title = 'Pre-Filter UMI distribution') )
  
  print(  dens_UMI   )
  print(  post_dens_UMI   )
  
  
  print( pdftable( round(bsl$nFeature_RNA,1) , title = 'Pre-Filter Unique Gene distribution') )
  
  print(  dens_feature   )
  print(  post_dens_feature  )
  
  
  print( pdftable( round(bsl$perc.mito,1) , title = 'Pre-Filter Percent Mito distribution') )
  
  print(  dens_mito  )
  print(  post_dens_mito   )
  
  
  print( pdftable( round(bsl$perc.hemoglobin,1) , title = 'Pre-Filter Percent Hemoglobin distribution') )
  
  print(  dens_hemo   )
  print(  post_dens_hemo   )
  
}, finally = dev.off()))

3.1 Filter out poor quality cells

Sometimes poor quality cells can occur in the data due to cell membrane damage during sample preparation, etc. This can bias the downstream analysis.

# Report text explaining the columns of the cell-filter summary table.
cat("\n\n

Below we show the number of cells removed with filtering. BasicFilter refers to cells failing a minimum threshold: by default, cells must have >= 500 UMIs, 200 unique genes, and <= 25% mitochondria and hemoglobin content.

Auto.complexity refers to outliers from a regression analysis modelling number of genes by number of UMIs, or the 'complexity' of the cell. Specifically we model the log of each of these, where the relationship is very close to linear. We use a double-regression strategy of both linear and loess regression, and outlier cells must have both a high linear regression Cook's distance and very low Loess negative residuals. This means that cells with a lower than expected number of genes given the number of UMIs are filtered out. Typically, this captures poor quality cells or extremely low-complexity cells such as RBCs.

Auto.libsize refers to cells identified as very low outliers based on median absolute deviation from the general distribution of UMIs. Auto.mito refers to cells identified as high outliers based on median absolute deviation from the general distribution of mitochondrial content.")

Below we show the number of cells removed with filtering. BasicFilter refers to cells failing a minimum threshold: by default, cells must have >= 500 UMIs, 200 unique genes, and <= 25% mitochondria and hemoglobin content.

Auto.complexity refers to outliers from a regression analysis modelling number of genes by number of UMIs, or the ‘complexity’ of the cell. Specifically we model the log of each of these, where the relationship is very close to linear. We use a double-regression strategy of both linear and loess regression, and outlier cells must have both a high linear regression Cook’s distance and very low Loess negative residuals. This means that cells with a lower than expected number of genes given the number of UMIs are filtered out. Typically, this captures poor quality cells or extremely low-complexity cells such as RBCs.

Auto.libsize refers to cells identified as very low outliers based on median absolute deviation from the general distribution of UMIs. Auto.mito refers to cells identified as high outliers based on median absolute deviation from the general distribution of mitochondrial content.

knitr::kable(fs)
BasicFilter Total libsize mito Unfiltered
Healthy_1 0 800 24 26 750
Healthy_2 1 800 37 20 742
Covid_Mild_1 2 800 25 17 756
Covid_Mild_2 2 707 16 22 667
Covid_Critical_1 1 684 3 31 649
Covid_Critical_2 1 800 7 48 744
# Report text introducing the table of filter parameters.
cat('\n\n
    
Here we show details of the filtering approach, including the minimum UMI and unique gene cutoffs, maximum percent mito and percent hemoglobin cutoffs. Additionally, we show parameters for the sample-wise cutoffs including median absolute deviation (mad) score threshold, and loess residual threshold. These are used to make tighter cutoffs in an automated, sample-by-sample basis. The actual sample wise cutoffs are provided for each sample in a report in the QC folder called "qualitycontrol_filtering".')

Here we show details of the filtering approach, including the minimum UMI and unique gene cutoffs, maximum percent mito and percent hemoglobin cutoffs. Additionally, we show parameters for the sample-wise cutoffs including median absolute deviation (mad) score threshold, and loess residual threshold. These are used to make tighter cutoffs in an automated, sample-by-sample basis. The actual sample wise cutoffs are provided for each sample in a report in the QC folder called “qualitycontrol_filtering”.

knitr::kable( aflist[[1]]$allcommands )
Command Option
mad.score.threshold 3
loess_negative_residual_threshold -5
min_num_UMI 500
min_num_Feature 200
max_perc_mito 25
max_perc_hemoglobin 25
globalfilter.complexity 1
globalfilter.libsize 1
globalfilter.mito 1

3.1.1 UMI distribution and filtering

Unique Molecular Identifiers (UMIs) are sequence labels that are attached to each transcript during library prep, which allow for counting unique mRNA transcripts in 10X data.

cat('Here we show the summary statistics for the distributions of UMIs in each sample before filtering.')

Here we show the summary statistics for the distributions of UMIs in each sample before filtering.

knitr::kable(bsl$nCount_RNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
Healthy_1 512 2315.00 3052.5 3359.045 4054.00 19994
Healthy_2 497 2920.25 3890.5 4200.469 4972.50 17778
Covid_Mild_1 451 2217.25 2874.5 3196.146 3814.75 41783
Covid_Mild_2 486 2156.00 2932.0 3312.269 3847.00 44034
Covid_Critical_1 487 2631.75 3947.5 4475.493 5612.75 39160
Covid_Critical_2 461 2262.50 3287.5 4342.238 4559.50 54956
cat('#### UMI pre-filter distribution
    
Here we plot the distribution of UMIs per cell for all samples before filtering.')

3.1.1.1 UMI pre-filter distribution

Here we plot the distribution of UMIs per cell for all samples before filtering.

print(dens_UMI)

cat('#### UMI post-filter distribution
    
Here we plot the distribution of UMIs per cell for all samples after filtering.')

3.1.1.2 UMI post-filter distribution

Here we plot the distribution of UMIs per cell for all samples after filtering.

print(post_dens_UMI)

3.1.2 Unique feature (gene) distribution and filtering

# Report text introducing the per-sample gene (feature) summary table.
cat('Here we show the summary statistics for the distributions of genes (or "features", as they are referred to in Seurat and machine learning jargon) in each sample before filtering.')

Here we show the summary statistics for the distributions of genes (or “features”, as they are referred to in Seurat and machine learning jargon) in each sample before filtering.

knitr::kable(bsl$nFeature_RNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
Healthy_1 331 915.75 1103.5 1150.872 1319.25 2994
Healthy_2 346 1150.75 1371.5 1407.696 1618.00 3550
Covid_Mild_1 303 826.50 1046.5 1081.787 1259.25 5834
Covid_Mild_2 303 950.00 1202.0 1241.434 1457.00 5096
Covid_Critical_1 302 919.00 1204.5 1288.404 1553.25 4571
Covid_Critical_2 339 882.00 1165.5 1338.001 1549.75 6095
cat('#### Unique Feature pre-filter distribution

Here we plot the distribution of features per cell for all samples before filtering.')

3.1.2.1 Unique Feature pre-filter distribution

Here we plot the distribution of features per cell for all samples before filtering.

print(dens_feature)

cat('#### Unique Feature post-filter distribution
    
Here we plot the distribution of features per cell for all samples after filtering.')

3.1.2.2 Unique Feature post-filter distribution

Here we plot the distribution of features per cell for all samples after filtering.

print(post_dens_feature)

3.1.3 Mitochondrial content level filtering

The percentage of mitochondrial content is used as a metric of cell quality in single-cell data. If cells are damaged during handling, they often sustain membrane tearing, which causes cytoplasmic RNA to leak, while mitochondria and mitochondrial RNA are retained. Thus, poor quality, damaged cells often have enriched mt-RNA.

cat('Here we show the summary statistics for the distributions of mitochondrial content in each sample before filtering.')

Here we show the summary statistics for the distributions of mitochondrial content in each sample before filtering.

knitr::kable(bsl$perc.mito)
Min. 1st Qu. Median Mean 3rd Qu. Max.
Healthy_1 0.0787608 2.7055380 3.576703 3.754190 4.527047 9.525773
Healthy_2 0.0572082 2.3795023 3.099182 3.325760 3.961888 9.483248
Covid_Mild_1 0.1339457 2.4872762 3.381768 3.670444 4.579035 9.745982
Covid_Mild_2 0.3158310 1.4212632 2.255410 2.610707 3.327504 9.384615
Covid_Critical_1 0.0000000 1.5955738 2.278323 2.567974 3.159081 8.747220
Covid_Critical_2 0.0000000 0.9512908 1.452882 1.883709 2.310267 9.349956
cat('#### Mito pre-filter distribution
    
Here we plot the distribution of mitochondrial content per cell for all samples before filtering.')

3.1.3.1 Mito pre-filter distribution

Here we plot the distribution of mitochondrial content per cell for all samples before filtering.

print(dens_mito)

cat('#### Mito post-filter distribution
    
Here we plot the distribution of mitochondrial content per cell for all samples after filtering.')

3.1.3.2 Mito post-filter distribution

Here we plot the distribution of mitochondrial content per cell for all samples after filtering.

print(post_dens_mito)

3.1.4 Hemoglobin content level filtering

Hemoglobin is expressed strongly but not exclusively by red blood cells (RBCs), which are typically removed during data generation by RBC lysis buffer, because they are extremely numerous and can overwhelm the tissue cells of interest. However, many samples will often still contain some RBCs (or many, if sample quality is poor). Removing them allows focusing in on the cells of interest.

cat('Here we show the summary statistics for the distributions of hemoglobin content in each sample before filtering.')

Here we show the summary statistics for the distributions of hemoglobin content in each sample before filtering.

knitr::kable(bsl$perc.hemoglobin)
Min. 1st Qu. Median Mean 3rd Qu. Max.
Healthy_1 0 0 0 0.0015999 0 0.2697842
Healthy_2 0 0 0 0.0013343 0 0.0652316
Covid_Mild_1 0 0 0 0.0013297 0 0.1336898
Covid_Mild_2 0 0 0 0.0053219 0 0.3181336
Covid_Critical_1 0 0 0 0.0013056 0 0.0634921
Covid_Critical_2 0 0 0 0.0007167 0 0.0833333
cat('#### Hemoglobin pre-filter distribution
    
Here we plot the distribution of hemoglobin content per cell for all samples before filtering.')

3.1.4.1 Hemoglobin pre-filter distribution

Here we plot the distribution of hemoglobin content per cell for all samples before filtering.

print(dens_hemo)

cat('#### Hemoglobin post-filter distribution
    
Here we plot the distribution of hemoglobin content per cell for all samples after filtering.')

3.1.4.2 Hemoglobin post-filter distribution

Here we plot the distribution of hemoglobin content per cell for all samples after filtering.

print(post_dens_hemo)

4 Processing and analysis for each sample

As stated above, below we detail the analysis for each sample, which includes:

  • Preprocessing: Normalize and scale gene expression with the Seurat SingleCellTransform (SCT) pipeline

  • Dimension reduction: Principal component analysis, graph construction, clustering, and visualization via UMAP. Ideally, this step requires some hyperparameter selection, including selecting the number of PCs and Louvain clustering resolution

  • Marker analysis: using a differential expression test, find the unique marker genes of each cluster

  • Label transfer: from a reference single-cell dataset, quantify how much each cell resembles a known cell type from a reference dataset, such as one derived from a paper or single-cell database

4.1 Select number of Principal Components (PCs) to use based on elbow plot

Principal Component Analysis (PCA) is important for downstream analysis including clustering and visualization with non-linear dimension reduction such as UMAP.

PCA finds high-dimensional planes which vary strongly across the cells. Essentially, each PC consists of very highly correlated genes. The first PC specifically can be thought of as a list of genes that together drive the most variance across cells. Each following PC has less variance.

PCA is used to help de-noise the data for downstream tasks like clustering. Individual genes can be noisy, but groups of correlated genes are less noisy.

An important caveat of PCA is that it is a linear method, in that the PC axes it finds are straight lines. This is okay for denoising genes but can often fail to capture the complexity of single-cell RNA-seq and other types of high dimensional genomic datasets. This is why the analysis does not stop at PCA but includes other more complex non-linear methods described below.

One important parameter for downstream analysis is the selection of how many PCs to use. We want to select the PCs that explain a sufficient amount of variance in the data. One way to do that is via “elbow plots” of each PC versus the standard deviation. The cutoff is made at the PC at which the SD stabilizes and becomes horizontal.

Including too few PCs can mean missing important sources of variations downstream and may result in for example, cell types being merged together in a single cluster. Conversely, including too many PCs can introduce noise into the data and result in clusters not easily explained by biology.

By default, we set the number of PCs to use as 30, which will be appropriate for most samples and is recommended as a default by the developers of SingleCellTransform. If the “Elbow” in the elbow plots strongly deviates from this, a different value may be selected and the analysis can be rerun later.

# Remove the intermediate objects listed below (all created earlier in the
# file) and then run a full garbage collection so their memory is released
# before the per-sample objects are loaded.
rm(list = c(
  'aflist', 'bsl', 'combmd',
  'dens_feature', 'post_dens_feature',
  'dens_hemo', 'post_dens_hemo',
  'dens_mito', 'post_dens_mito',
  'dens_UMI', 'post_dens_UMI',
  'fs', 'maxdens', 'mdlist',
  'submd', 'featlims', 'umilims'
))

# gc() returns a usage table; invisible() keeps it out of the report.
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


#test if force_redo is T or if saved sobj result does not exist
## if test == T, do it, if not skip it
### WILL IMPLEMENT LATER


### may implement later: read in serial, process parallel?
# readin serial, split up according to num workers 
# https://stackoverflow.com/questions/3318333/split-a-vector-into-chunks
# x = sample_metadata$code; split(x, ceiling(seq_along(x)/workernum))

# force_redo_test <- T




# Load every sample's saved object from the temporary QC directory, normalise
# it with SCTransform and run PCA on the resulting SCT assay. The outcome is a
# list with one processed object per entry of sample_metadata$Code, named by
# that code.
sobjlist <- setNames(
  lapply(sample_metadata$Code, function(samp_code) {
    
    # one .rds file per sample code inside qctmpdir
    # (alternative location, currently unused:
    #  paste0(outdir_indi_seuratobjs, '/SeuratObject-', code, '.rds') )
    rds_path <- paste0(qctmpdir, '/', samp_code, '.rds')
    seu <- readRDS(rds_path)
    
    # SCTransform is fitted on the raw RNA assay
    DefaultAssay(seu) <- 'RNA'
    seu <- SCTransform(seu, assay = "RNA", verbose = verbose, method = 'glmGamPoi', vst.flavor = 'v2')
    
    # the PCA result is the value returned for this sample
    RunPCA(seu, assay = "SCT", verbose = verbose)
    
  }),
  sample_metadata$Code
)

# Everything is in memory now, so the temporary directory can be deleted.
unlink(qctmpdir, recursive = TRUE)



## check elbow plot

# Build one elbow plot (first 50 PCs) per sample, titled with the sample's
# project name.
elbowplots <- lapply(sobjlist, function(sobj){
  
  ElbowPlot(sobj, ndims = 50) + ggtitle(sobj@project.name)
  
})
#plot one at a time rather than side by side, label was getting cut off
#patchwork::wrap_plots(elbowplots)

# seq_along() rather than 1:length(): with an empty list, 1:0 would iterate
# over c(1, 0) and fail on elbowplots[[1]].
for(i in seq_along(elbowplots)){
  # the dotted red line marks pcs_indi, the number of PCs used downstream
  print(elbowplots[[i]] + geom_vline(xintercept = pcs_indi, linetype = 'dotted', color = 'red'))
}

4.2 Clustering

Once PCA is completed and the number of critical PCs is selected, we perform a number of steps that all involve grouping cells together based on shared transcriptomic patterns:

  • k-nearest neighbor graph construction

  • Louvain clustering

  • Non-linear dimensionality reduction for visualization, such as t-SNE or UMAP

This process allows us to group cells together based on transcriptomic similarity. Louvain clustering optionally allows for input of a hyperparameter called “resolution,” with higher resolution values yielding larger numbers of clusters. By default, we set this value to 0.5.

# The elbow plots have been printed already; drop them and reclaim memory.
rm(list = 'elbowplots')
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


# Continue the per-sample pre-processing: neighbour graph, clustering and
# UMAP, all computed on the first pcs_indi principal components. The same
# number of PCs is used for every sample (the sample-wise alternative,
# ndims_vec[sobj@project.name], is not used).
sobjlist <- lapply(sobjlist, function(seu) {
  
  pc_dims <- c(1:pcs_indi)
  
  seu <- FindNeighbors(seu, reduction = "pca", dims = pc_dims, verbose = verbose)
  
  # res_indi is the clustering resolution
  seu <- FindClusters(seu, verbose = verbose, resolution = res_indi)
  
  # # optional 3d UMAP, kept for reference
  # seu <- Seurat::RunUMAP(seu, reduction = "pca", dims = pc_dims,
  #                        n.components = 3,
  #                        reduction.name = 'UMAP3D', reduction.key = 'UMAP3D_')
  
  # standard UMAP; its result is the value returned for this sample
  RunUMAP(seu, reduction = "pca", dims = pc_dims, verbose = verbose)
  
})

# ```




# ```{r markers_individualsamps_clusters}


# ## calculate markers for individual sample clusters
# 
# This is parallelized with Future as per Seurat recommendations.
# Future multisession mode.

#turn off future, it seems to break things

# future::plan('multisession', workers=workernum)




# Folder for the per-sample cluster marker tables. The sub-folder name records
# the number of PCs and the clustering resolution that produced the clusters.
indimarkerdir <- paste0(outdir_indi, '/individualsample_clustermarkers/')
sampmarkersave <- paste0(indimarkerdir, '/markers-PCs_', pcs_indi, '-res_', res_indi, '/' )
dir.create(sampmarkersave, recursive = TRUE)


# For every sample: compute positive cluster markers, add a ranking score and
# write the table to CSV. Markers are always recomputed; reading a previously
# written CSV back in was turned off as too dangerous.
mlist_individualsamples_clusters <- setNames(
  lapply(sample_metadata$Code, function(samp_code) {
    
    csv_path <- paste0(sampmarkersave, samp_code, '_clustermarkers.csv')
    
    markers <- FindAllMarkers(sobjlist[[samp_code]], only.pos = TRUE, verbose = verbose)
    
    # as of Nov 9 2023 (Seurat v5): add score to markers
    # score = (difference in detection rate) * log2 fold change
    markers$score <- (markers$pct.1 - markers$pct.2) * markers$avg_log2FC
    
    write.csv(markers, csv_path, quote = FALSE, row.names = FALSE)
    
    markers
    
  }),
  sample_metadata$Code
)



# future::plan(strategy = 'sequential')
# When label transfer is enabled, emit the markdown header and introduction
# for the label-transfer section of the report.
if (use_labeltransfer == TRUE) {
  
  # pieces are joined with newlines; empty strings give the blank lines
  text <- paste(
    c(
      '## Label transfer from reference scRNA-seq data',
      '',
      'With a reference single-cell RNA-seq dataset, we use label transfer to infer which celltypes are present in the new data.',
      '',
      'One way to do this is via "label transfer", an integration-based machine learning method for classification. We use the [Seurat method](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for label transfer here.',
      '',
      ''
    ),
    collapse = '\n'
  )
  
  cat(text)
  
}

4.3 Label transfer from reference scRNA-seq data

With a reference single-cell RNA-seq dataset, we use label transfer to infer which celltypes are present in the new data.

One way to do this is via “label transfer”, an integration-based machine learning method for classification. We use the Seurat method for label transfer here.

if(use_labeltransfer == T){
  
  # Label transfer for every individual sample (only when enabled).
  # For each sample this:
  #   1. finds transfer anchors against the reference,
  #   2. stores the per-cell prediction scores as the "predictions" assay,
  #   3. stores the top call per cell, its score, and a thresholded version,
  #   4. derives one cell-type label per cluster from the mean prediction
  #      scores and writes that mapping to ClusterCelltypeMapping.csv.
  
  labeltransfer_outdir_indi <- paste0(outdir_indi, '/labeltransfer/')
  dir.create(labeltransfer_outdir_indi, recursive = T)
  
  # the reference must carry a `Celltype` metadata column (used below)
  reference <- readRDS(refdatapath)
  
  
  ### unfortunately, Seurat does not like underscores in feature names, so we need to replace
  # levels are re-ordered by decreasing frequency after the replacement
  if( any(grepl('_', reference$Celltype)) ){
    ct <- reference$Celltype
    ct <- as.character(ct)
    ct <- gsub('_', '-', ct)
    ct <- factor(ct, levels = names(sort(table(ct), decreasing = T)) )
    reference$Celltype <- ct
  }
  
  
  # #pre-process the reference data
  # # make sure this is done beforehand!!!
  # reference <- SCTransform(reference, ncells = 3000, verbose = verbose) %>%
  #   RunPCA(verbose = verbose) %>%
  #   RunUMAP(dims = 1:30, verbose = verbose)
  # 
  # reference$Celltype <- reference$subclass
  # 
  # refdatapath = "data/vignette/allen_cortex_preproc.rds"
  # saveRDS(reference, refdatapath)
  
  ### actually label-transfer ###
  
  sobjlist <- lapply(sobjlist, function(sobj){
    
    if(verbose == T){ message('\nLabel transfer for: ',sobj@project.name, '\n') }
    
    
    #get the integration score matrix
    
    # if( SeuratLabelTransfer.normalization.method == 'auto' ){
    #   
    #   
    #   if('SCT' %in% names(reference@assays) ){
    #     SeuratLabelTransfer.normalization.method <- 'SCT'
    #   } else{
    #     SeuratLabelTransfer.normalization.method <- 'LogNormalize'
    #   }
    #   
    #   
    # }
    # normalization method is fixed to SCT (the 'auto' detection above is disabled)
    SeuratLabelTransfer.normalization.method <- 'SCT'
    
    anchors <- FindTransferAnchors(reference = reference, query = sobj, normalization.method = SeuratLabelTransfer.normalization.method, verbose = verbose)
    
    # full score matrix, stored as an assay
    predictions.assay <- TransferData(anchorset = anchors, 
                                      refdata = reference$Celltype, 
                                      prediction.assay = T,
                                      weight.reduction = sobj[["pca"]], dims = 1:30, verbose = verbose)
    
    
    sobj[["predictions"]] <- predictions.assay
    
    
    #get the top calls for each cell
    
    topcalls <- TransferData(anchorset = anchors, refdata = reference$Celltype, prediction.assay = F,
                             weight.reduction = sobj[["pca"]], dims = 1:30, verbose = verbose)
    
    # factor levels ordered by how many cells received each call
    levs <- names( sort(table(topcalls$predicted.id), decreasing = T) )
    sobj$top_celltype_call_seurat <- factor(topcalls$predicted.id, levels = levs)
    sobj$top_celltype_call_seurat_score <- topcalls$prediction.score.max
    
    #add thresholded score: calls with a max score below 0.3 become NA
    # which() keeps the row index free of NAs, which data-frame assignment
    # does not accept
    md <- sobj@meta.data
    md$top_celltype_thresholded <- md$top_celltype_call_seurat
    md[which(md$top_celltype_call_seurat_score < 0.3), 'top_celltype_thresholded'] <- NA
    sobj$top_celltype_thresholded <- md$top_celltype_thresholded
    
    
    
    
    ### get cluster-wise labeltransfer max ###
    #get predmat
    predmat <- sobj@assays$predictions@data
    
    #drop "max" (the last row)
    # drop = FALSE keeps a one-row matrix when the reference has a single
    # cell type; otherwise the column subsetting below would fail
    predmat <- predmat[seq_len(nrow(predmat) - 1), , drop = FALSE]
    
    #for each cluster, get max
    md <- sobj@meta.data
    clustname <- 'seurat_clusters'
    
    clustmaxlist <- lapply(levels(md[,clustname]), function(clust){
      clustcells <- rownames( md[md[,clustname] == clust,] )
      clustpred <- predmat[,colnames(predmat) %in% clustcells,drop=F]
      
      # mean prediction score per cell type within this cluster; keep the best
      clust_avgscores <- Matrix::rowMeans(clustpred)
      maxscore <- clust_avgscores[which.max(clust_avgscores)]
      
      data.frame(cluster = clust, max = names(maxscore), score = maxscore, row.names = NULL)
      
    })
    
    clustmaxdf <- dplyr::bind_rows(clustmaxlist)
    
    
    #make a label
    clustmaxdf$label <- paste0( clustmaxdf$max)
    
    
    #uncertainty... if < 0.3, put as NA
    # if between 0.3 - 0.5, put putative
    clustmaxdf[clustmaxdf$score < 0.3,'label'] <- NA
    putative <- clustmaxdf$score >= 0.3 & clustmaxdf$score < 0.5
    clustmaxdf[putative,'label'] <- paste0( clustmaxdf[putative,'label'],
                                            '_putative')
    
    
    #add to metadata
    # clustmaxdf rows follow levels(seurat_clusters), so labels map positionally
    sobj$celltype_cluster_prediction <- sobj$seurat_clusters
    sobj$celltype_cluster_prediction <- plyr::mapvalues(sobj$celltype_cluster_prediction,
                                                        from = levels(sobj$celltype_cluster_prediction),
                                                        to = clustmaxdf$label)
    
    
    #write out cluster-celltype mapping
    labeltransfer_outdir_indi_code <- paste0(labeltransfer_outdir_indi, '/', sobj@project.name, '/')
    dir.create(labeltransfer_outdir_indi_code)
    
    clustcelltypemapfile <- paste0(labeltransfer_outdir_indi_code, '/ClusterCelltypeMapping.csv')
    write.csv(x=clustmaxdf,
              file = clustcelltypemapfile, 
              row.names = F, quote = F)
    
    
    sobj
    
  })
  
  
  # m_reference <- readRDS(m_reference)
  
  # actually we only need this later. read in at the right time. for now save path
  m_reference_path <- m_reference
  
  
  rm(reference) #for memory saving
  
  
  # ```
  
  invisible(gc(full = T, reset = F, verbose = F))
  
  
  
}

5 Individual sample clustering summary plots (pre-integration)

In each sample, we perform clustering and label transfer. Here we plot the clusters and cluster markers for each sample. This can help identify which cell types are present in each sample.

# Folder that receives the summary PDFs, one file per sample.
dir.create(
  path = paste0(outdir_indi, '/individualsample_plots/'),
  recursive = TRUE
)


#prep summary plots for each sample
# umap of clusters
# insert QC per cluster plots here?
# cluster markers

# Build the summary plots for every sample. For each object in sobjlist the
# function returns a list with:
#   d1_a  UMAP coloured by cluster,
#   hm    heatmap of the top 5 markers (by score) of each cluster,
#   d0    violin plots of four QC metrics.
summaryplots_individualsamples <- lapply( sobjlist , function(sobj){
  
  # set up title
  sampname <- sobj@project.name
  
  #get markers
  m <- mlist_individualsamples_clusters[[sampname]]
  
  
  
  #clusters plot
  # for auto plotting with manually set res, need to use paste here...
  
  plottingvar <- paste0('SCT_snn_res.', res_indi)
  
  #dimplot of clusters
  d1_a <- wrap_plots(
    DimPlot(sobj, group.by = plottingvar, label = T, repel = T)
  ) + plot_annotation(title = sampname, caption = 'Louvain Clusters plotted on UMAP')
  
  
  
  # ## 3d umap ##
  # embs <- as.data.frame(sobj@reductions$UMAP3D@cell.embeddings)
  # embs <- cbind(embs, sobj$seurat_clusters); colnames(embs)[4] <- 'seurat_clusters'
  # embs$cluster_color <- plyr::mapvalues(embs$seurat_clusters, 
  #                                       from = levels(embs$seurat_clusters), 
  #                                       to = scales::hue_pal()(length(levels(embs$seurat_clusters))))
  # #add labels...
  # labdf <- aggregate(cbind(UMAP3D_1, UMAP3D_2, UMAP3D_3) ~ seurat_clusters, embs, median)
  # labdf$cluster_color <- levels(embs$cluster_color)
  # 
  # #store inrgedients, run rgl when its time to print
  # three_d_umap_ingrdients <- list(embs = embs, labdf=labdf)
  # 
  # 
  
  
  # qc plots, do it with patchwork
  d0 <- wrap_plots(ncol = 2, list(
    VlnPlot(sobj, 'nCount_RNA', pt.size = 0.1)+NoLegend() ,
    VlnPlot(sobj, 'nFeature_RNA', pt.size = 0.1)+NoLegend(),
    VlnPlot(sobj, 'percent.mito', pt.size = 0.1)+NoLegend() ,
    VlnPlot(sobj, 'percent.hemoglobin', pt.size = 0.1)+NoLegend()
  )) + plot_annotation(title = sampname)
  
  
  # cluster markers: top n per cluster, ranked by the marker score
  
  
  DefaultAssay(sobj) <- 'SCT'
  n <- 5
  top <- m %>% group_by(cluster) %>% top_n(n = n, wt = score)
  genes <- top$gene
  #make sure genes are in 
  if( any( !(genes %in% rownames(sobj@assays$SCT@scale.data)) ) ){
    
    #try getresidual...
    missinggenes <- genes[!(genes %in% rownames(sobj@assays$SCT@scale.data))]
    sobj <- GetResidual(sobj, missinggenes, na.rm = F, replace.value = T)
    
    #it can be complicated doing this after integration, some genes are NAs... 
    scgem <- sobj@assays$SCT@scale.data
    
    # drop genes whose residuals contain NAs, from both the matrix and `top`
    if( any( !complete.cases(scgem) ) ){
      scgem <- scgem[complete.cases(scgem),]
      top <- top[top$gene %in% rownames(scgem),]
      sobj@assays$SCT@scale.data <- scgem
    }
    rm(scgem)
    
  }
  
  #prep heatmap
  top <- top[top$gene %in% rownames(sobj),]
  gem <- sobj@assays$SCT@scale.data
  gem <- gem[match(top$gene, rownames(gem)),]
  #annot for clusters
  #first order gem by cluster...
  md <- sobj@meta.data
  md <- md[order(md$seurat_clusters),]
  gem <- gem[,match(rownames(md), colnames(gem))]
  clust_bc <- setNames(md$seurat_clusters,
                       nm = colnames(gem)
  )
  col_clust <- setNames(scales::hue_pal()(length(levels(sobj$seurat_clusters))),
                        nm = levels(sobj$seurat_clusters))
  ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, col = list(Cluster = col_clust), show_legend = F)
  #annot for markers
  #set genes according to ct_ordered
  top$cluster <-factor(top$cluster, levels=unique(top$cluster))
  # top <- top[order(top$cluster),]
  gem <- gem[match(top$gene, rownames(gem)),]
  ct_gene <- setNames(top$cluster,
                      nm=top$gene)
  col_gene <- col_clust
  col_gene <- col_gene[names(col_gene) %in% top$cluster]
  ha_genes <- ComplexHeatmap::rowAnnotation(Cluster = ct_gene, col = list(Cluster = col_gene),
                                            show_annotation_name=F)
  #restrict range to [-5, 5]
  # NOTE: the space in `gem < -5` matters. `gem[gem<-5]` parses as
  # `gem[gem <- 5]`, which overwrites one element instead of clipping.
  gem[gem > 5] <- 5
  gem[gem < -5] <- -5
  
  #actual heatmap
  hm <- ComplexHeatmap::Heatmap(gem,
                                # column_title = 'Integrated clusters',
                                column_labels = rep('', ncol(gem)),
                                row_names_gp = grid::gpar(fontsize = 5),
                                column_split = md$seurat_clusters,
                                row_split = top$cluster,
                                row_title_gp = grid::gpar(fontsize = 5),
                                row_gap = unit(0.8, "mm"), 
                                column_gap = unit(0.8, "mm"),
                                row_title_rot = 0,
                                column_title_rot = 45,
                                column_title_gp = grid::gpar(fontsize = 7),
                                name = 'Scaled\nExpression',
                                cluster_columns = F,
                                cluster_rows = F,
                                top_annotation = ha_clust,
                                left_annotation = ha_genes,
                                use_raster = F)
  
  # invisible(hm <- draw(hm, column_title = sampname))
  
  
  list(
    d1_a=d1_a,
    # three_d_umap_ingrdients = three_d_umap_ingrdients,
    
    hm=hm,
    d0=d0
  )
  
} )


#name the list elements, one per sample
# vapply() guarantees a character vector with exactly one name per sample
# (sapply() may return a list when the input is empty or irregular)
names(summaryplots_individualsamples) <- vapply(sobjlist, function(sobj){sobj@project.name}, character(1))





# Write one PDF per sample into individualsample_plots/, with the cluster
# UMAP, the marker heatmap and the QC violins on consecutive pages.
lapply(names(summaryplots_individualsamples), function(samp_id) {
  
  pdf_path <- paste0(outdir_indi, '/individualsample_plots/', samp_id, '.pdf')
  pdf(pdf_path, width = 9, height = 9)
  
  samp_plots <- summaryplots_individualsamples[[samp_id]]
  
  # page order: clusters, heatmap, QC
  for (plot_key in c('d1_a', 'hm', 'd0')) {
    print(samp_plots[[plot_key]])
  }
  
  dev.off()
  
})




if(use_labeltransfer == T){
  
  
  
  ## plots of cell type predictions from label transfer
  
  # For each sample, build the label-transfer summary plots. The returned list
  # holds: the cluster UMAP (d1_a), a heatmap and a dotplot of prediction
  # scores per cluster, heatmaps of reference markers (per cell and averaged
  # per cluster), UMAPs of the top calls and their scores, an alluvial plot of
  # clusters vs. calls, the cluster-level call UMAP (d3), the cluster-celltype
  # mapping table, and one violin plot per predicted cell type (ct_plots).
  ctplots_individualsamples <- lapply( sobjlist, function(sobj){
    
    #set up title
    sampname <- sobj@project.name
    
    
    #clusters plot
    # for auto plotting with manually set res, need to use paste here...
    
    plottingvar <- paste0('SCT_snn_res.', res_indi)
    
    #dimplot of clusters
    d1_a <- wrap_plots(
      DimPlot(sobj, group.by = plottingvar, label = T, repel = T)
    ) + plot_annotation(title = sampname, caption = 'Louvain Clusters plotted on UMAP')
    
    
    
    
    ### prep the Celltype Plots ("ctplots")
    
    
    # violinplots of all celltypes
    
    #get exp of the celltypes
    # the last row of the predictions assay ("max") is dropped
    rs <- Matrix::rowSums(sobj@assays$predictions@data)
    rs <- head( rs , -1)
    
    #keep only celltypes that are exp
    rs <- sort(rs[rs>0], decreasing = T)
    ct_in <- names(rs)
    
    #plots of all celltypes in data...
    DefaultAssay(sobj) <- 'predictions'
    
    ct_plots <- lapply(setNames(ct_in,ct_in), function(ct){
      #FeaturePlot(sobj, features = ct) + ggtitle(sampname)
      
      ct_plot <-  VlnPlot(sobj, ct)+NoLegend()+ylab('Prediction score') + 
        plot_annotation(title = sampname)
      
      return(ct_plot)
      
    })
    DefaultAssay(sobj) <- 'SCT'
    
    
    
    
    #heatmap of the prediction scores per cluster
    # use scaled values, comparable between clusters
    # get avgs
    avgs <- AverageExpression(sobj, assays = 'predictions', return.seurat = F)
    #remove max
    avgs <- head( as.data.frame(avgs) , -1)
    # remove empty prediction rows with all 0s
    avgs <- avgs[Matrix::rowSums(avgs)>0,]
    #make column titles nicer
    colnames(avgs) <- gsub('predictions.', 'cluster_', colnames(avgs))
    #format as numeric matrix
    avgs <- as.matrix(avgs)
    ### scale --> this emphasizes diffs between clusters, seems to put related cell types together
    avgs <- t(scale(t(avgs)))
    #select middle value  for color scale (the mean of the scaled matrix)
    medval <- mean(avgs)
    #plot it
    hm_ctscores <- ComplexHeatmap::Heatmap(avgs,
                                           name = 'Scaled\nmean prediction\nscores per cluster',
                                           column_title = sampname,
                                           rect_gp = grid::gpar(col = "white", lwd = 0.5),
                                           col = circlize::colorRamp2(c(min(avgs), medval, max(avgs)), c("blue", "white", "red")))
    
    
    
    
    
    #for dotplot, add dendrogram and row label order
    # it may throw a warning about drawing plot first etc, 
    #shouldn't be an issue if we set seed, which we did
    suppressWarnings(
      dend <- row_dend(hm_ctscores)
    )
    
    #get ordered row labels for dotplot and other plots
    suppressWarnings(
      ct_ordered <- rownames(hm_ctscores@matrix)[ComplexHeatmap::row_order(hm_ctscores)]
    )
    
    
    nicedend <- ggdendro::ggdendrogram(rev(dend), rotate = T) +   
      scale_y_reverse(expand = c(0.05, 0))+
      theme(axis.text.y = element_blank(),
            axis.text.x = element_blank())
    
    dp_ctscores <- DotPlot(sobj, assay = 'predictions', rev(ct_ordered)) + 
      coord_flip() + 
      theme(axis.title.y=element_blank(),
            axis.text.y =  element_text(hjust = 0),
            axis.text.x = element_text(size=5) )+
      scale_color_gradient2(low = 'blue', high = 'red', mid = 'grey')+
      xlab(label = 'Cluster')+
      guides(color = guide_colorbar(title = "Scaled Average\nPrediction Score"))
    
    dp_ctscores <- patchwork::wrap_plots(list(nicedend, dp_ctscores), widths = c(0.2,1))
    
    
    
    
    
    # heatmap of reference markers
    m_reference <- readRDS(m_reference_path)
    
    
    #make sure to remove underscores, since seurat doesn't like it in label transfer feature names
    m_ref_small <- m_reference
    
    lt_ref_levs <- levels(m_ref_small$cluster)
    lt_ref_levs <- gsub('_', '-', lt_ref_levs)
    m_ref_small$cluster <- plyr::mapvalues(m_ref_small$cluster, from = levels(m_ref_small$cluster), lt_ref_levs)
    
    # keep markers present in this sample and belonging to predicted cell types
    m_ref_small <- m_ref_small[m_ref_small$gene %in% rownames(sobj),]
    m_ref_small <- m_ref_small[m_ref_small$cluster %in% ct_in,]
    n <- 5
    top <- m_ref_small %>% group_by(cluster) %>% top_n(n = n, wt = avg_log2FC)
    
    genes <- top$gene
    
    
    #make sure genes are in 
    if( any( !(genes %in% rownames(sobj@assays$SCT@scale.data)) ) ){
      sobj <- GetResidual(sobj, genes)
    }
    
    #prep heatmap
    top <- top[top$gene %in% rownames(sobj),]
    gem <- sobj@assays$SCT@scale.data
    gem <- gem[match(top$gene, rownames(gem)),]
    
    #annot for clusters
    #first order gem by cluster...
    md <- sobj@meta.data
    md <- md[order(md$seurat_clusters),]
    gem <- gem[,match(rownames(md), colnames(gem))]
    
    clust_bc <- setNames(md$seurat_clusters,
                         nm = colnames(gem)
    )
    
    col_clust <- setNames(scales::hue_pal()(length(levels(sobj$seurat_clusters))),
                          nm = levels(sobj$seurat_clusters))
    
    ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, 
                                                  col = list(Cluster = col_clust),
                                                  show_legend = F
    )
    
    
    
    #annot for markers
    
    #set genes according to ct_ordered
    top$cluster <-factor(top$cluster, levels=ct_ordered)
    top <- top[order(top$cluster),]
    gem <- gem[match(top$gene, rownames(gem)),]
    
    ct_gene <- setNames(top$cluster,
                        nm=top$gene)
    coul <- RColorBrewer::brewer.pal(8, "Set2") 
    coul <- colorRampPalette(coul)(length(unique(top$cluster)))
    col_gene <- setNames(coul, nm = unique(top$cluster))
    
    ha_genes <- ComplexHeatmap::rowAnnotation(Celltype = ct_gene, col = list(Celltype = col_gene),
                                              show_annotation_name=F)
    
    
    #restrict range to [-5, 5]
    # NOTE: the space in `gem < -5` matters. `gem[gem<-5]` parses as
    # `gem[gem <- 5]`, which overwrites one element instead of clipping.
    gem[gem > 5] <- 5
    gem[gem < -5] <- -5
    
    
    #actual heatmap
    hm_refmarkers <- ComplexHeatmap::Heatmap(gem,
                                             #column_title = sampname,
                                             column_labels = rep('', ncol(gem)),
                                             row_names_gp = grid::gpar(fontsize = 5),
                                             column_split = md[,paste0('SCT_snn_res.', res_indi)],
                                             row_split = top$cluster,
                                             row_title_gp = grid::gpar(fontsize = 5),
                                             row_gap = unit(0.8, "mm"), 
                                             column_gap = unit(0.8, "mm"),
                                             column_title_rot = 45,
                                             column_title_gp = grid::gpar(fontsize = 7),
                                             row_title_rot = 0,
                                             name = 'Scaled\nExpression',
                                             cluster_columns = F,
                                             cluster_rows = F,
                                             top_annotation = ha_clust,
                                             left_annotation = ha_genes,
                                             use_raster = F)
    
    
    # hm_refmarkers <- draw(hm_refmarkers, column_title = sampname)
    
    
    
    #try to average the matrix (mean per gene within each cluster)
    avgl <- lapply(levels(md$seurat_clusters), function(clust){
      mdc <- md[md$seurat_clusters==clust,]
      # drop = FALSE: a cluster with a single cell must stay a matrix for rowMeans()
      gemc <- gem[,colnames(gem) %in% rownames(mdc), drop = FALSE]
      avg <- matrix(rowMeans(gemc), 
                    dimnames = list(rownames(gem), clust))
      avg
    })
    avg <- do.call('cbind',avgl)
    
    #need to re-prep column annot
    clust_bc <- factor(str_sort(colnames(avg), numeric = T), levels = str_sort(colnames(avg), numeric = T))
    
    col_clust <- setNames(scales::hue_pal()(length(levels(sobj$seurat_clusters))),
                          nm = levels(sobj$seurat_clusters))
    
    ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, col = list(Cluster = col_clust), show_legend = F)
    
    
    hm_refmarkers_avg <- ComplexHeatmap::Heatmap(avg,
                                                 #column_title = sampname,
                                                 row_names_gp = grid::gpar(fontsize = 6),
                                                 column_split = factor(str_sort(colnames(avg), numeric = T), levels = str_sort(colnames(avg), numeric = T)), 
                                                 column_names_gp = grid::gpar(fontsize = 6),
                                                 column_title_rot = 45,
                                                 column_names_rot = 0, 
                                                 
                                                 row_split = top$cluster,
                                                 row_title_gp = grid::gpar(fontsize = 6),
                                                 row_gap = unit(0.5, "mm"), 
                                                 column_gap = unit(0.5, "mm"),
                                                 row_title_rot = 0,
                                                 name = 'Scaled\nExpression',
                                                 cluster_columns = F,
                                                 cluster_rows = F,
                                                 top_annotation = ha_clust,
                                                 left_annotation = ha_genes,
                                                 use_raster = F)
    
    
    # hm_refmarkers_avg <- draw(hm_refmarkers_avg, column_title = sampname)
    
    
    
    
    #top cell type plots
    plottingvar <- 'top_celltype_thresholded'
    
    
    
    d2_a <- wrap_plots(
      DimPlot(sobj, group.by = plottingvar, label = T, repel = T)
    ) + plot_annotation(title = sampname)
    
    d2_b <- wrap_plots(
      FeaturePlot(sobj, 'top_celltype_call_seurat_score')) + plot_annotation(title = sampname)
    
    #alluvial plot, clusters to cell types
    md <- sobj@meta.data
    labelsdf <- md[,c('seurat_clusters', 'top_celltype_thresholded')]
    
    
    ap <- wrap_plots( alluvialplot(labelsdf, repel = T, direction='y')
    ) + plot_annotation(title = sampname)
    
    
    
    ## read in the cluster-celltype mapping that the label-transfer step wrote out
    labeltransfer_outdir_indi_code <- paste0(labeltransfer_outdir_indi, '/', sobj@project.name, '/')
    
    clustcelltypemapfile <- paste0(labeltransfer_outdir_indi_code, '/ClusterCelltypeMapping.csv')
    
    clustmaxdf <- read.csv(clustcelltypemapfile)
    
    
    #also plot the cluster celltype call on dimplot
    d3 <- DimPlot(sobj, group.by = 'celltype_cluster_prediction', label = T, repel = T)
    
    
    
    #using heatmap dendrogram, order in a cool way
    suppressWarnings(
      ct_ordered <- rownames(hm_ctscores@matrix)[ComplexHeatmap::row_order(hm_ctscores)]
    )
    
    ct_plots <- ct_plots[ct_ordered] 
    
    
    
    list(d1_a=d1_a, 
         hm_ctscores=hm_ctscores,
         dp_ctscores=dp_ctscores,
         hm_refmarkers=hm_refmarkers,
         hm_refmarkers_avg=hm_refmarkers_avg,
         
         d2_a=d2_a,
         d2_b=d2_b,
         ap=ap,
         
         d3 = d3,
         clustmaxdf=clustmaxdf,
         
         ct_plots=ct_plots
    )
    
    
  })
  
  
  #name them with the code names
  # vapply() guarantees a character vector with exactly one name per sample
  names(ctplots_individualsamples) <- vapply(sobjlist, function(sobj){sobj@project.name}, character(1))
  
  #save as PDFs
  
  lapply(names(ctplots_individualsamples), function(sampname){
    
    #print to labeltransfer df
    
    labeltransfer_outdir_indi_code <- paste0(labeltransfer_outdir_indi, '/', sampname, '/')
    pdf( paste0(labeltransfer_outdir_indi_code, '/LabelTransferSummaryPlots.pdf'), width = 9, height = 9 )
    
    ## print each but make sure table gets printed as a pdftable ##
    ctplots_thissamp <- ctplots_individualsamples[[sampname]]
    for(i in seq_along(ctplots_thissamp) ){
      
      if(names(ctplots_thissamp)[i] == 'clustmaxdf'){
        print( scDAPP::pdftable(ctplots_thissamp[[i]], title = 'Cluster-Celltype Mapping') )
      } else{
        print(ctplots_thissamp[[i]])
      }
      
    }
    
    dev.off()
    
    return(sampname)
    
  })
  
  
  
}
#print to report

# Emit one report section per sample: a "## Sample <name>" heading followed by
# the cluster UMAP (d1_a), the cluster marker heatmap (hm) and the QC plots
# (d0). Each plot is preceded by a markdown sub-heading and a short
# explanation written with cat().

# do this in a way that creates a section for each sample...
# https://stackoverflow.com/questions/36674824/use-loop-to-generate-section-of-text-in-rmarkdown

template <- "


## Sample %s


" # don't forget the newline




# seq_along() instead of 1:length(): with zero samples 1:0 would run the body
# for i = 1 and i = 0 and fail when indexing the missing list element
for (i in seq_along(summaryplots_individualsamples)) {
  sampname <- names(summaryplots_individualsamples)[i]
  
  # section heading for this sample
  cat(sprintf(template, sampname))
  
  
  sampsumplots <- summaryplots_individualsamples[[i]]
  
  
  
  #plot each plot one at a time with appropriate label
  
  plotlab <- "


### UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.


"
  
  cat( plotlab )
  print( sampsumplots$d1_a )
  
  
  
  #   ## 3d umap ##
  #   
  #   
  #   plotlab <- "
  # 
  # 
  # ### 3D UMAP of clusters
  # 
  # Here we plot a 3D UMAP of the Louvain clusters for this sample.
  # 
  # 
  # "
  #   
  #   cat( plotlab )
  #   print( sampsumplots$d1_a )
  #   
  #   embs <- as.data.frame(sobj@reductions$UMAP3D@cell.embeddings)
  #   embs <- cbind(embs, sobj$seurat_clusters); colnames(embs)[4] <- 'seurat_clusters'
  #   embs$cluster_color <- plyr::mapvalues(embs$seurat_clusters, 
  #                                         from = levels(embs$seurat_clusters), 
  #                                         to = scales::hue_pal()(length(levels(embs$seurat_clusters))))
  #   #add labels...
  #   labdf <- aggregate(cbind(UMAP3D_1, UMAP3D_2, UMAP3D_3) ~ seurat_clusters, embs, median)
  #   labdf$cluster_color <- levels(embs$cluster_color)
  #   
  #   #store inrgedients, run rgl when its time to print
  #   three_d_umap_ingrdients <- summaryplots_individualsamples$three_d_umap_ingrdients
  #   embs <- three_d_umap_ingrdients$embs
  #   labdf <- three_d_umap_ingrdients$labdf
  #   
  #   
  #   UMAP3D_1 = embs$UMAP3D_1; UMAP3D_2 = embs$UMAP3D_2; UMAP3D_3 = embs$UMAP3D_3
  #   rgl::plot3d(UMAP3D_1, UMAP3D_2, UMAP3D_3, col = embs$cluster_color)
  #   rgl::text3d(labdf$UMAP3D_1, labdf$UMAP3D_2, labdf$UMAP3D_3, texts = labdf$seurat_clusters, 
  #               adj = c(2,2,2))
  #   
  #   
  
  
  plotlab <- "


### Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).


"
  
  cat( plotlab )
  print( sampsumplots$hm )
  
  
  plotlab <- "


### QC plots 

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.


"
  
  cat( plotlab )
  print( sampsumplots$d0 )
  
}

5.1 Sample Healthy_1

5.1.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.1.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.1.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

5.2 Sample Healthy_2

5.2.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.2.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.2.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

5.3 Sample Covid_Mild_1

5.3.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.3.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.3.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

5.4 Sample Covid_Mild_2

5.4.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.4.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.4.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

5.5 Sample Covid_Critical_1

5.5.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.5.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.5.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

5.6 Sample Covid_Critical_2

5.6.1 UMAP of clusters

Here we plot a UMAP of the Louvain clusters for this sample.

5.6.2 Heatmap of cluster markers

Here we plot the top 5 marker genes of each cluster as identified by wilcoxon test. Sometimes, clusters may share the same markers, which may indicate the clusters are similar, such as subtypes of the same cell type (for example, CD4 vs CD8 T cells).

5.6.3 QC plots

Here we plot some QC information, including the number of UMIs (nCount_RNA) and the number of unique genes (nFeature_RNA). Very low values may indicate a low number of cells in a specific area or low quality. If the clustering and markers look very strange, the quality metrics may help diagnose issues by indicating a cluster of poor quality cells.

# When label transfer is enabled, write the top-level report section that
# introduces the per-sample label transfer results. The text is emitted
# verbatim with cat(), so it is interpreted as markdown in the report.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
if(use_labeltransfer == TRUE){
  
  
  text <- '
# Individual sample label transfer cell type predictions

For each sample we perform label transfer from a reference single-cell RNA-seq dataset. Here, we plot the results of the label transfer. Using a prior single-cell RNAseq dataset, we learn which cell types are present in the current data.

Using label transfer, we can get a score for each cell type in the reference dataset. A high score indicates a strong and confident match, while a low score indicates a match of low confidence and may represent a cell type that was not captured in the reference.

Note that we apply a label transfer score cutoff of 0.3. If the cells are below this, they are marked as NA. These may represent cells missing from the reference.


It can be complex and time-consuming to analyze each sample individually before integration; however, the reward is often worth the effort. Characterizing each sample allows us to assess the presence of cell types in each sample carefully. For example, we may find that all the samples contain neuron cells, while only one sample may contain macrophage cells, but other samples do not. This is useful for interpreting the downstream results of differential expression across samples; for example, it would be wise to focus on a comparative analysis of the cell types in all samples (neurons), and simply note that macrophages are present in some but missing in other samples.

Additionally, it is also useful to observe how the cell types are related to one another in the individual samples. We may observe that immune cell types like T cells and B cells may cluster closely together, while very different cell types, such as neurons and endothelial cells, may cluster far apart. Identifying such clustering patterns in individual samples may help identify cases of "over-integration" or other issues.
'
  
  cat(text)
  
  
}

6 Individual sample label transfer cell type predictions

For each sample we perform label transfer from a reference single-cell RNA-seq dataset. Here, we plot the results of the label transfer. Using a prior single-cell RNAseq dataset, we learn which cell types are present in the current data.

Using label transfer, we can get a score for each cell type in the reference dataset. A high score indicates a strong and confident match, while a low score indicates a match of low confidence and may represent a cell type that was not captured in the reference.

Note that we apply a label transfer score cutoff of 0.3. If the cells are below this, they are marked as NA. These may represent cells missing from the reference.

It can be complex and time-consuming to analyze each sample individually before integration; however, the reward is often worth the effort. Characterizing each sample allows us to assess the presence of cell types in each sample carefully. For example, we may find that all the samples contain neuron cells, while only one sample may contain macrophage cells, but other samples do not. This is useful for interpreting the downstream results of differential expression across samples; for example, it would be wise to focus on a comparative analysis of the cell types in all samples (neurons), and simply note that macrophages are present in some but missing in other samples.

Additionally, it is also useful to observe how the cell types are related to one another in the individual samples. We may observe that immune cell types like T cells and B cells may cluster closely together, while very different cell types, such as neurons and endothelial cells, may cluster far apart. Identifying such clustering patterns in individual samples may help identify cases of “over-integration” or other issues.

# When label transfer is enabled, emit one report section per sample holding
# every label transfer result stored in ctplots_individualsamples: score
# heatmap and dotplot, reference marker heatmaps, UMAPs, the alluvial plot,
# the cluster-celltype table and one violin plot per cell type. Each item is
# preceded by a markdown heading and explanation written with cat().
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
if(use_labeltransfer == TRUE){
  
  #print to report
  
  # do this in a way that creates a section for each sample...
  # https://stackoverflow.com/questions/36674824/use-loop-to-generate-section-of-text-in-rmarkdown
  
  # level-2 heading: one per sample
  template_samples <- "


## Sample %s


"
  
  
  # level-3 heading template; not referenced in this chunk
  # NOTE(review): kept in case a later chunk relies on it - confirm
  template_plots <- "


### %s


"
  
  # level-4 heading: one per cell type in the violin plot section
  template_celltypes <- "


#### %s


"
  
  
  # seq_along() instead of 1:length(): with zero samples 1:0 would run the
  # body for i = 1 and i = 0 and fail when indexing the missing element
  for (i in seq_along(ctplots_individualsamples)) {
    sampname <- names(ctplots_individualsamples)[i]
    
    cat(sprintf(template_samples, sampname))
    
    
    ctplotlist <- ctplots_individualsamples[[i]]
    
    #print them, with titles
    
    
    plotlab <- "


### Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.


"
    
    cat( plotlab )
    print(ctplotlist$hm_ctscores)
    
    
    
    plotlab <- "


### Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above. 

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.


"
    
    cat( plotlab )
    
    print(ctplotlist$dp_ctscores)
    
    
    
    
    
    plotlab <- "


### Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.



"
    
    cat( plotlab )
    
    print(ctplotlist$hm_refmarkers)
    
    
    
    plotlab <- "


### Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.




"
    
    cat( plotlab )
    
    print(ctplotlist$hm_refmarkers_avg)
    
    
    
    
    
    plotlab <- "


### UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.



"
    
    cat( plotlab )
    
    print(ctplotlist$d2_a)
    
    
    plotlab <- "


### UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.


"
    
    cat( plotlab )
    
    
    
    print(ctplotlist$d2_b)
    
    
    
    plotlab <- "


### Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster.
The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.



"
    
    cat( plotlab )
    
    
    print(ctplotlist$ap)
    
    
    
    # single-quoted because the text itself contains double quotes
    plotlab <- '


### Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset.
Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix "_putative" to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

'
    
    cat( plotlab )
    
    # round the score column for display only; the stored table is untouched
    clustmaxprint <- ctplotlist$clustmaxdf
    clustmaxprint$score <- round(clustmaxprint$score, digits = 5)
    print(knitr::kable(clustmaxprint))
    
    
    
    plotlab <- "


### UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.



"
    
    cat( plotlab )
    
    
    print(ctplotlist$d3)
    
    
    
    plotlab <- "


### Violin plots for each cell type

Here we plot a violin plot for each cell type to analyze the label transfer scores type in detail.


"
    
    cat( plotlab )
    
    # one sub-section per cell type; seq_along() skips the loop cleanly when
    # there are no per-celltype plots
    for(j in seq_along(ctplotlist$ct_plots) ){
      
      ctname <- names(ctplotlist$ct_plots[j])
      
      cat(sprintf(template_celltypes, ctname))
      
      print( ctplotlist$ct_plots[[j]] )
      
    }
    
    
    
    
    
    
  }
  
  
  
}

6.1 Sample Healthy_1

6.1.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.1.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.1.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.1.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.1.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.1.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.1.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.1.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix “_putative” to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 NK 0.96341 NK
1 MNP 1.00000 MNP
2 T 0.99916 T
3 B 0.97787 B
4 T 0.91109 T
5 B 1.00000 B
6 MNP 1.00000 MNP

6.1.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.1.10 Violin plots for each cell type

Here we plot a violin plot for each cell type to analyze the label transfer scores type in detail.

6.1.10.1 MNP

6.1.10.2 B

6.1.10.3 NK

6.1.10.4 T

6.2 Sample Healthy_2

6.2.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.2.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.2.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.2.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.2.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.2.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.2.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.2.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix “_putative” to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 NK 0.97930 NK
1 MNP 1.00000 MNP
2 T 1.00000 T
3 B 0.97849 B
4 B 0.99921 B
5 T 0.99633 T
6 MNP 1.00000 MNP

6.2.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.2.10 Violin plots for each cell type

Here we plot a violin plot for each cell type to analyze the label transfer scores type in detail.

6.2.10.1 NK

6.2.10.2 T

6.2.10.3 B

6.2.10.4 MNP

6.3 Sample Covid_Mild_1

6.3.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.3.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.3.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.3.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.3.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.3.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.3.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.3.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix “_putative” to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 B 0.98125 B
1 NK 0.98922 NK
2 MNP 0.99991 MNP
3 T 0.99811 T
4 T 0.98932 T
5 NK 0.99483 NK

6.3.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.3.10 Violin plots for each cell type

Here we plot a violin plot for each cell type to analyze the label transfer scores type in detail.

6.3.10.1 NK

6.3.10.2 B

6.3.10.3 MNP

6.3.10.4 T

6.4 Sample Covid_Mild_2

6.4.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.4.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.4.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.4.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.4.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.4.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.4.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.4.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix “_putative” to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 NK 0.88159 NK
1 MNP 1.00000 MNP
2 B 0.96980 B
3 T 0.99507 T
4 T 0.93617 T

6.4.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.4.10 Violin plots for each cell type

Here we show a violin plot for each cell type to examine its label transfer scores in detail.

6.4.10.1 NK

6.4.10.2 B

6.4.10.3 MNP

6.4.10.4 T

6.5 Sample Covid_Critical_1

6.5.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.5.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.5.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.5.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.5.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.5.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.5.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.5.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 and 0.5 are marked with the suffix “_putative” to indicate their somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 T 0.99916 T
1 MNP 1.00000 MNP
2 B 0.99378 B
3 NK 0.86349 NK
4 B 0.99557 B

6.5.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.5.10 Violin plots for each cell type

Here we show a violin plot for each cell type to examine its label transfer scores in detail.

6.5.10.1 MNP

6.5.10.2 T

6.5.10.3 NK

6.5.10.4 B

6.6 Sample Covid_Critical_2

6.6.1 Heatmap of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

6.6.2 Dotplot of label transfer cell type scores

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above.

However, the dotplot also includes information about how many cells in the cluster express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A blue dot indicates relatively low score. A small dot indicates few cells express the score.

6.6.3 Heatmap of reference cell type markers

Here we plot the top 5 reference celltype markers as sorted by average log2 fold change. The markers are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type. For example, a cluster with high T cell score should highly express T cell markers.

6.6.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

6.6.5 UMAP of top scoring celltypes

The top scoring celltype for each cell is plotted on the UMAP.

6.6.6 UMAP of label transfer scores

The scores of the cell types are plotted on the UMAP. This shows how confident the prediction is. If the score is low, it may be that the cells present in this dataset are missing from the reference.

6.6.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster. The top-scoring cell type is plotted, but some cells and clusters may contain a mix of cell types.

6.6.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 and 0.5 are marked with the suffix “_putative” to indicate their somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
0 B 1.00000 B
1 MNP 1.00000 MNP
2 T 0.99799 T
3 NK 0.96310 NK
4 NK 0.63160 NK
5 T 0.98930 T

6.6.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

6.6.10 Violin plots for each cell type

Here we show a violin plot for each cell type to examine its label transfer scores in detail.

6.6.10.1 MNP

6.6.10.2 B

6.6.10.3 NK

6.6.10.4 T

7 Integration of samples

Here, we combine all of the samples into a single unified dataset.

“Integration” refers to the process of combining together the data from individual samples with batch correction. Typically, library prep is done one sample at a time, which may introduce bias due to technical artifacts associated with processing for that sample, i.e., particular reagent lot used, slightly longer incubation periods, slightly more reads sequenced, etc. To combine samples, we thus apply special batch-correction procedures. The method for doing that here relies on Reference Principal Component Integration, implemented in the RISC package (Liu, Zheng et al Nat Biotech 2021). This approach exploits the fact that cells from the same type typically have similar transcriptomic patterns, and then tries to match the patterns.

Ultimately, the goal of integration is to minimize technical noise and maximize shared biological signals, the strongest of which is typically cell type.

Integration can be contrasted with “concatenation,” which involves simply merging the samples without any batch correction. If there are strong batch effects in the data, this method may result in clusters driven not by cell type but by the sample of origin (i.e., cluster 1 is all cells from sample 1, cluster 2 is all cells from sample 2).

# Before integrating, annotate every per-sample Seurat object in a single
# pass with the metadata columns used further down:
#   orig.ident / Code : the sample code (sample_metadata$Code)
#   Sample            : Sample value of the matching sample_metadata row
#   Condition         : Condition value of the matching sample_metadata row
#   Barcode           : the meta.data rownames, inserted as the first column
codes <- sample_metadata$Code

sobjlist <- lapply(codes, function(code){

  # sample_metadata row(s) describing this sample
  code_rows <- sample_metadata$Code == code

  obj <- sobjlist[[code]]

  # sample identifiers
  obj$orig.ident <- code
  obj$Code <- code
  obj$Sample <- sample_metadata[code_rows,]$Sample

  # condition
  obj$Condition <- sample_metadata[code_rows,"Condition"]

  # barcode column: prepend the rownames, then give the new column its name
  cellmeta <- obj@meta.data
  cellmeta <- cbind(rownames(cellmeta), cellmeta)
  colnames(cellmeta)[1] <- 'Barcode'
  obj@meta.data <- cellmeta

  obj
})

# lapply() drops the names, so restore them from the sample codes
names(sobjlist) <- codes





# Clean up the environment before integration: drop plot lists, cutoff
# tables and marker tables created earlier in the pipeline.
# Each object is named exactly once; listing a name twice makes rm() warn
# that the second occurrence is "not found".
rm(ctplotlist, ctplots_individualsamples, elbowplots, m_reference,
   cutoffplots, cutoffs, sampsumplots,
   mlist_individualsamples_clusters,
   qc_vln_feature, qc_vln_umi,
   summaryplots_individualsamples)



# release the freed memory
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))




#### cache each per-sample object on disk and keep only its metadata ####

# temporary folder holding one .rds file per sample
tmpobjdir <- paste0(outdir_indi, '/.tmp_Seurat_objects/')
dir.create(tmpobjdir, recursive = TRUE)

# For every sample code: write the Seurat object to the temporary folder and
# return its cell-level metadata. The resulting list is named by sample code.
mdlist <- setNames(
  lapply(sample_metadata$Code, function(code){
    obj <- sobjlist[[code]]
    saveRDS(obj, paste0(tmpobjdir, '/', code, '.rds'))
    obj@meta.data
  }),
  sample_metadata$Code
)



### qc purposes, read in  --> keep commented
# sobjlist <- lapply(sample_metadata$Code, function(code){
#   
#   message(code)
#   sobj <- readRDS( paste0(tmpobjdir, '/', code, '.rds') )
#   
#   sobj
#   
# })
# names(sobjlist) <- sample_metadata$Code



# Remove the in-memory Seurat objects (they were written to tmpobjdir above)
# and release the memory.
rm(sobjlist)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


# Read the raw RNA count matrix of every sample and keep only the cells listed
# in that sample's metadata (mdlist), in the metadata's row order.
#  - input_seurat_obj == TRUE : counts are taken from '<datadir>/<Sample>.rds'
#  - otherwise                : counts are read from the single
#                               'filtered_feature_bc_matrix.h5' found anywhere
#                               below '<datadir>/<Sample>'
# Both input types share the same barcode filtering, so it is done once at
# the end instead of being duplicated per branch.
matlist <- lapply(sample_metadata$Code, function(code){

  #get md for this object
  md <- mdlist[[code]]

  #sample name belonging to this code
  samp <- sample_metadata[sample_metadata$Code == code,'Sample']

  if(input_seurat_obj == TRUE){

    #for HTO, we will assume saved objects
    sobjfile <- paste0(datadir, '/', samp, '.rds')
    sobj <- readRDS(sobjfile)

    #get RNA counts matrix
    mat0 <- GetAssayData(sobj, assay = 'RNA', layer = 'counts')

  } else{

    datafp <- paste0(datadir, '/', samp)
    # if on hpc, use below
    # datafp <- paste0(datadir, '/', samp, '/outs/')

    # for the dl data, we need to find the filepath
    h5_filename <- grep(pattern = 'filtered_feature_bc_matrix.h5',
                        list.files(datafp, recursive = TRUE, full.names = TRUE),
                        value = TRUE)

    # Read10X_h5() needs exactly one file; fail with a readable message
    # instead of an obscure error when none or several are found
    if(length(h5_filename) != 1){
      stop('Expected exactly one filtered_feature_bc_matrix.h5 under "',
           datafp, '" for sample code "', code, '", found ',
           length(h5_filename),
           call. = FALSE)
    }

    #get RNA counts matrix
    mat0 <- Read10X_h5(h5_filename)

  }

  #filter using sobj metadata: every retained cell must exist in the raw matrix
  cellidx <- match(rownames(md), colnames(mat0))
  if(anyNA(cellidx)){
    stop(sum(is.na(cellidx)), ' barcode(s) of sample code "', code,
         '" are missing from its raw count matrix',
         call. = FALSE)
  }

  mat0[,cellidx]

})

names(matlist) <- sample_metadata$Code

# keep intersect genes; will only remove genes if aligned with different GTFs
# (every matrix ends up with the same genes in the same order)
var0 <- Reduce(intersect, lapply(matlist, FUN = rownames))
matlist <- lapply(matlist, function(mat){ mat[match(var0, rownames(mat)), ] })
rm(var0)

#join matrices, filter genes from joint mat and proceed with union
bigmat <- do.call(cbind, matlist)

#get num cells expressing: count the stored entries per row of the sparse
# matrix (bigmat@i holds 0-based row indices).
# nbins = nrow(bigmat) is required: without it tabulate() stops at the highest
# row that has an entry, and the shorter vector would be silently recycled
# when used as a logical index below, selecting the wrong genes.
num_nonzeros <- tabulate(bigmat@i + 1, nbins = nrow(bigmat))

#get joint filtered genes as those exp by  >= 3 cells
joint_filt_genes <- rownames(bigmat)[num_nonzeros >= 3]



# Build one RISC object per sample from its raw counts (restricted to the
# jointly filtered genes) and its cell-level metadata.
risclist <- lapply(sample_metadata$Code, function(code){

  # cell metadata for this sample
  cellmeta <- mdlist[[code]]

  # counts restricted to, and ordered like, the jointly filtered genes
  counts <- matlist[[code]]
  counts <- counts[rownames(counts) %in% joint_filt_genes,]
  counts <- counts[match(joint_filt_genes, rownames(counts)),]

  # cell identifiers: the barcode up to the first '-', prefixed with the
  # sample's orig.ident and a dot; stored as the first metadata column
  barcodes <- stringr::str_split_fixed(rownames(cellmeta), '-', 2)[,1]
  barcodes <- paste0(cellmeta$orig.ident, '.',barcodes)
  cellmeta <- cbind(barcodes, cellmeta)

  # gene annotation table: one row per gene, symbol taken from the rownames
  genemeta <- data.frame(Symbol = rownames(counts), row.names = rownames(counts))

  # is.filter must stay FALSE, otherwise sample-specific filtering is
  # applied on top of the joint gene filter
  readsc(counts, cellmeta, genemeta, is.filter = FALSE)
})

names(risclist) <- sample_metadata$Code


## store the per-sample count matrices on disk; they are read back later to
## build the RNA assay of the integrated object
outdir_int_objects <- paste0(outdir_int, '/data_objects/')
dir.create(outdir_int_objects, recursive = TRUE)
saveRDS(matlist, paste0(outdir_int_objects, '/.concatmatrix.rds'))

# drop everything that is now either inside risclist or saved on disk
rm(bigmat, num_nonzeros, joint_filt_genes, matlist, mdlist)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))






#### process each sample ####
# make sure to set ncore to 1 for all; we'll parallelize across samples for this

# Per-sample RISC preprocessing.
# obj0: a RISC object for one sample.
# Returns the object after scFilter (called with thresholds that are wide
# open and is.filter = FALSE), scNormalize (single core) and scDisperse.
# A progress message is emitted before each step.
process0 <- function(obj0){

  message('scFilter')
  filtered <- RISC::scFilter(obj0, min.UMI = 0, max.UMI = Inf, min.gene = 0, min.cell = 0, is.filter = FALSE)

  message('scNormalize')
  normalized <- RISC::scNormalize(filtered, ncore = 1)

  # highly variable genes
  message('scDisperse')
  RISC::scDisperse(normalized)
}




### process each sample with RISC functions, in parallel
# One foreach task per sample on a cluster of `workernum` workers.
# The cluster is shut down in a `finally` handler so the worker processes are
# released even when process0() fails on one of the samples.
cl <- parallel::makeCluster(workernum)
doParallel::registerDoParallel(cl)

risclist <- tryCatch(
  foreach(dat0 = risclist,
          .packages = c('RISC')) %dopar%
    {
      process0(dat0)
    },
  finally = parallel::stopCluster(cl)
)

invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))

# foreach returns an unnamed list in input order; restore the sample codes
names(risclist) <- sample_metadata$Code



## variable genes - we still use all genes.
# i.e. every gene symbol that is present in all RISC objects
var0 <- Reduce(intersect, lapply(risclist, function(obj){ obj@rowdata$Symbol }))


# Build the InPlot now so it can be printed later in the report.
# Drawing happens on a null PDF device, which keeps the plot from being
# shown prematurely at this point.
pdf( NULL )

ip <- patchwork::wrap_plots(
  InPlot(risclist, var.gene = var0, Std.cut = 0.95, ncore = workernum)
)

dev.off()



### reference- either user-defined, or guess

if( !is.null(risc_reference) ){

  # A user-supplied reference may be given either as a sample Code or as a
  # Sample name. Resolve it to the row position of that sample in
  # sample_metadata, which is what `ref` holds from here on.
  #
  # match() compares whole strings. grep() would treat the name as a regular
  # expression and also accept partial hits (e.g. "Healthy_1" matching
  # "Healthy_10"), returning several positions.

  # first try the Code column
  ref <- match(risc_reference, sample_metadata$Code)

  # not a Code: look it up among the Sample names (the Sample column, not Code)
  if(anyNA(ref)){
    ref <- match(risc_reference, sample_metadata$Sample)
  }

  # stop early with a clear message instead of failing later on an
  # undefined or ambiguous `ref`
  if(length(ref) != 1 || is.na(ref)){
    stop('risc_reference "', paste(risc_reference, collapse = ', '),
         '" does not identify exactly one entry of sample_metadata$Code ',
         'or sample_metadata$Sample',
         call. = FALSE)
  }

}


# Automated reference guess. It is computed whether or not a reference was
# supplied, but it only decides `ref` when risc_reference is NULL.
# InPlot is not automated and does not suggest which sample to use, so the
# score below favours the sample with the most clusters, given that all
# samples were processed otherwise identically (same PCs and resolution).


# number of distinct seurat_clusters values in each sample
numclusts <- sapply(risclist, function(obj){
  length(unique(obj@coldata$seurat_clusters))
})

# cell number score: cells per sample relative to the largest sample,
# so bigger samples are prioritized
numcells_per_sample <- sapply(risclist, function(obj){ nrow(obj@coldata) })
numcells_per_sample <- numcells_per_sample / max(numcells_per_sample)

# weight the number of clusters by the cell number score
numclusts <- numclusts * numcells_per_sample

# Mean, over clusters, of the variance of the cluster-averaged expression.
# pseudobulk() is a helper defined outside this section; per the original
# note it adds cells up per cluster, hence the division by the cluster sizes.
# NOTE(review): sweep() assumes the pseudobulk columns follow the order of
# table(seurat_clusters) - confirm against the helper.
pbvar <- sapply(risclist, function(obj){

  cellmeta <- obj@coldata

  clustersums <- pseudobulk(obj = obj@assay$logcount, metadata = cellmeta, grouping_colname_in_md = 'seurat_clusters')

  # average: divide each cluster column by its number of cells
  clustermeans <- sweep(clustersums, 2, table(cellmeta$seurat_clusters), FUN = '/')

  # variance within each cluster column, then the mean across clusters
  mean(apply(clustermeans, 2, var))

})

# reference score = weighted number of clusters * mean cluster variance
refscore <- numclusts * pbvar



### try to divide by KS? ###



# position of the best scoring sample
maxautoscore <- which.max(refscore)

# use the autoselected sample only when no reference was provided
if( is.null(risc_reference) ){
  ref <- maxautoscore
}



# The reference must be the first list element: move it to the front and
# keep all other samples in their original relative order (names are kept).
if(ref != 1){
  data0 <- c(risclist[ref], risclist[-ref])
} else{
  data0 <- risclist
}

# risclist is superseded by data0
rm(risclist)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))
### actual integration ###
# "eigens" is the number of PCs used for integration (taken from pcs_int)
eigens <- pcs_int

# integrate the list of RISC objects into one object
# (method = "RPCI" and do.fast = "AUTO" are intentionally left unset)
data0 <- scMultiIntegrate(
  objects = data0, eigens = eigens, add.Id = NULL, var.gene = var0,
  align = 'OLS', npc = 50, adjust = TRUE,
  ncore = workernum
)


rm(var0)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))

# integrated UMAP, computed from the "PLS" embedding with `eigens` components
data0 <- scUMAP(data0, npc = eigens, use = "PLS")

#### integrated clustering ####
### AS OF JUL 5 2023 USE scDAPP::scCluster_louvain_res()
# # integrated clustering
# # neighbor = 10 is default... maybe make it a variable too...?
# data0 <- scCluster(data0, 
#                    slot = "cell.pls",
#                    method = 'louvain',
#                    npc = eigens, 
#                    neighbor = 10
# )
# 
# ### remap clust, biggest to smallest...
# #get risc clust
# rc <- data0@coldata$Cluster
# 
# #sort by biggest to smallest
# bs <- sort(table(rc), decreasing = T)
# rc <- plyr::mapvalues(rc, from = names(bs), to = c(1:length(bs)) )
# rc <- factor(rc, levels = 1:length(bs))
# 
# #set to object
# data0@coldata$Cluster <- rc
# rm(rc,bs)

# cluster the integrated object on the "cell.pls" slot, using `eigens`
# components, resolution res_int and RISC_louvain_neighbors neighbors
data0 <- scDAPP::scCluster_louvain_res(
  object = data0,
  slot = "cell.pls",
  npc = eigens,
  resolution = res_int,
  neighbor = RISC_louvain_neighbors
)







#save the integrated RISC object
# The output folder normally exists already (it is created when the
# concatenated matrix is saved), so dir.create() is told not to warn about
# that; recursive = TRUE still creates it, parents included, if it is missing.
outdir_int_objects <- paste0(outdir_int, '/data_objects/')
dir.create(outdir_int_objects, recursive = TRUE, showWarnings = FALSE)
saveRDS(data0, paste0(outdir_int_objects, '/RISC-object_integrated.rds'))



#cluster markers --> do this with seurat instead
# riscmarkertime <- proc.time()
# risc_clustermarkers = AllMarker(data0, ncore = workernum)
# riscmarkertime <- proc.time() - riscmarkertime
# takes about an hour, even with parallelization

# Move the integrated results out of the RISC object and into a Seurat object:
# expression values, cell metadata and the two embeddings.

# embeddings
umap <- data0@DimReduction$cell.umap
pca <- data0@DimReduction$cell.pls

# cell metadata and the column-bound logcount matrices
md <- data0@coldata
mat <- do.call(cbind, data0@assay$logcount)


# Rename the cluster column to a label recording npc and resolution.
# NOTE(review): this renames the LAST metadata column, i.e. it assumes the
# clustering step appended its "Cluster" column at the end - confirm.
risc_clust_lab <- paste0('RISC_Louvain_npc', pcs_int, '_res', res_int)
clustname <- risc_clust_lab
colnames(md)[ncol(md)] <- clustname

# Seurat object whose 'RISC' assay carries the integrated values as `data`
sobjint <- CreateSeuratObject(
  counts = CreateAssayObject(data=mat),
  assay = 'RISC',
  project = 'Integrated',
  meta.data = md
)

# attach the embeddings; the cell.pls embedding is stored under the name 'pca'
sobjint[['umap']] <- CreateDimReducObject(umap, assay = 'RISC', key = 'UMAP_')
sobjint[['pca']] <- CreateDimReducObject(pca, assay = 'RISC', key = 'PCA_')


# everything is inside sobjint now
rm(umap,pca,mat,md, data0)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))

### find markers ###

# mirror the integrated clusters in `seurat_clusters` and use them as the
# active identities
sobjint$seurat_clusters <- sobjint@meta.data[,risc_clust_lab]
sobjint <- SetIdent(sobjint, value = sobjint@meta.data[,risc_clust_lab])


# save the integrated Seurat object (it is re-read and updated further down)
saveRDS(sobjint, paste0(outdir_int_objects, '/Seurat-object_integrated.rds'))


# output location of the integrated cluster markers; the file name records
# the number of PCs and the resolution
intmarkersdir <- paste0(outdir_int, '/markergenes_intclusters/')
dir.create(intmarkersdir, recursive = TRUE)
intmarkersfile <- paste0(intmarkersdir, '/intmarkers-npc', pcs_int, '-res', res_int,  '.csv')

#get markers, parallelized
# TURN OFF PAR FOR NOW, it seems to break things
# ALSO MAKE SURE NOT TO DO READ IN THING IN CASE OF CLUSTERING DIFFS
# future::plan('multisession', workers=workernum)

# if(!file.exists(intmarkersfile)){
#   
#   #Seurat clusters
#   seuratmarkertime <- proc.time()
# m_integrated_clusters <- FindAllMarkers(sobjint,
#                                         only.pos = T)
#   seuratmarkertime <- proc.time() - seuratmarkertime
#   
#   
#   # future::plan(strategy = 'sequential')
#   
#   # takes about an hour, future parallel is not stable and cause memory crash
#   
#   
# write.csv(m_integrated_clusters, intmarkersfile, quote = F, row.names = F)
#   
#   
# } else{
# m_integrated_clusters <- read.csv(intmarkersfile)
# 
# }


# positive markers of every integrated cluster (run sequentially)
m_integrated_clusters <- FindAllMarkers(sobjint, only.pos = TRUE)

# as of Nov 9 2023 (Seurat v5): add score to markers
# score = (pct.1 - pct.2) * avg_log2FC
m_integrated_clusters$score <- m_integrated_clusters$avg_log2FC *
  (m_integrated_clusters$pct.1 - m_integrated_clusters$pct.2)


write.csv(m_integrated_clusters, intmarkersfile, quote = FALSE, row.names = FALSE)

# Keep only the integrated metadata (intmd) and drop sobjint for now, so each
# per-sample object can be read in and saved; sobjint is read back in after.
intmd <- sobjint@meta.data
rm(sobjint)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


## Add the integrated clusters to every per-sample object, save it, and
## (only if use_labeltransfer == T) collect its predictions assay.

# output folder for the processed per-sample objects
outdir_indi_seuratobjs <- paste0(outdir_indi, '/processed_Seurat_objects/')
dir.create(outdir_indi_seuratobjs, recursive = TRUE)

# One element per sample code: the prediction matrix when label transfer is
# used, otherwise just the code itself as a placeholder.
predictionmats <- lapply(sample_metadata$Code, function(code){

  # per-sample object cached in the temp dir
  obj <- readRDS(paste0(tmpobjdir, '/', code, '.rds'))

  # integrated metadata of this sample, reordered so that its scBarcode
  # follows the Barcode column of the per-sample metadata
  intmd_sub <- intmd[intmd$Code == code,]
  intmd_sub <- intmd_sub[match(obj@meta.data$Barcode, intmd_sub$scBarcode),]

  # copy the integrated clusters over; clustname is defined above
  obj@meta.data[,clustname] <- intmd_sub[,clustname]

  # save the updated sample object to the processed-objects dir
  invisible(
    saveRDS(obj, paste0(outdir_indi_seuratobjs, '/SeuratObject-', code, '.rds'))
  )

  # without label transfer there is no predictions assay to return
  if(!(use_labeltransfer == TRUE)){
    return(code)
  }

  # prediction scores with columns renamed to intmd_sub$Barcode
  predictionmat <- obj@assays$predictions@data
  colnames(predictionmat) <- intmd_sub$Barcode
  predictionmat

})



#remove temp folder (the per-sample objects now live in the processed dir)
unlink(tmpobjdir, recursive = TRUE)



#read sobjint back in
sobjint <- readRDS( paste0(outdir_int_objects, '/Seurat-object_integrated.rds') )


# Add the predictions assay to the integrated Seurat object and derive a
# cluster-level cell type label from it. Skipped without label transfer.
if(use_labeltransfer == TRUE){

  #combine prediction matrices and add to seurat obj
  predmat <- as(as.matrix(dplyr::bind_cols(predictionmats)), "sparseMatrix")
  rownames(predmat) <- rownames(predictionmats[[1]])

  #match order of barcodes to the integrated metadata
  predmat <- predmat[,match(rownames(intmd), colnames(predmat))]

  predassay <- CreateAssayObject(data=predmat)

  sobjint[['predictions']] <- predassay



  ### also predict int cluster-celltype mapping ###

  #get predmat
  predmat <- sobjint@assays$predictions@data

  #drop "max", which is assumed to be the last row.
  # seq_len() is safe when only one row exists, and drop = FALSE keeps a
  # matrix even when a single cell type remains (otherwise the column
  # subsetting and rowMeans below would fail on a plain vector).
  predmat <- predmat[seq_len(nrow(predmat) - 1), , drop = FALSE]

  #for each cluster, get the cell type with the highest average score
  md <- sobjint@meta.data
  clustname <- 'seurat_clusters'

  clustmaxlist <- lapply(levels(md[,clustname]), function(clust){
    clustcells <- rownames( md[md[,clustname] == clust,] )
    clustpred <- predmat[,colnames(predmat) %in% clustcells,drop=FALSE]

    clust_avgscores <- Matrix::rowMeans(clustpred)
    maxscore <- clust_avgscores[which.max(clust_avgscores)]

    data.frame(cluster = clust, max = names(maxscore), score = maxscore, row.names = NULL)

  })

  clustmaxdf <- dplyr::bind_rows(clustmaxlist)


  #make a label
  clustmaxdf$label <- paste0( clustmaxdf$max)


  #uncertainty... if score < 0.3, put as NA
  # if 0.3 <= score < 0.5, append '_putative'
  putative_rows <- clustmaxdf$score >= 0.3 & clustmaxdf$score < 0.5
  clustmaxdf[clustmaxdf$score < 0.3,'label'] <- NA
  clustmaxdf[putative_rows,'label'] <- paste0( clustmaxdf[putative_rows,'label'],
                                               '_putative')


  #add to metadata: map every cluster level to its label
  sobjint$INT_celltype_cluster_prediction <- sobjint$seurat_clusters
  sobjint$INT_celltype_cluster_prediction <- plyr::mapvalues(sobjint$INT_celltype_cluster_prediction,
                                                             from = levels(sobjint$INT_celltype_cluster_prediction),
                                                             to = clustmaxdf$label)



  #write out cluster-celltype mapping
  labeltransfer_outdir_int <- paste0(outdir_int, '/labeltransfer/')
  dir.create(labeltransfer_outdir_int)

  clustcelltypemapfile <- paste0(labeltransfer_outdir_int, '/IntClusterCelltypeMapping.csv')
  write.csv(x=clustmaxdf,
            file = clustcelltypemapfile,
            row.names = FALSE, quote = FALSE)


  # predictionmats is deliberately NOT removed here: it is removed right
  # after this block in both cases, and removing it twice makes rm() warn
  rm(predmat, predassay, clustmaxlist, putative_rows)

}




## clean up env
rm(predictionmats, intmd)

invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


## readin concat matrix with raw counts and put in seurat object ##
matlist <- readRDS( paste0(outdir_int_objects, '/.concatmatrix.rds') )

#for each matrix, change cell names to match seurat object:
# build a two-column lookup (set, code) from the Set and Code metadata columns
# of sobjint, splitting at the first '_' only
# NOTE(review): assumes the Set values themselves contain no '_' - confirm
set_code <- paste0(sobjint$Set, '_', sobjint$Code)
set_code <- unique(set_code)
set_code <- stringr::str_split_fixed(set_code, pattern = '_', n = 2)
matlist <- lapply(seq_along(matlist), function(i){
  mat <- matlist[[i]]
  code <- names(matlist)[i]
  set <- set_code[set_code[,2] == code,1]
  # prefix every barcode with the set of its sample
  colnames(mat) <- paste0(set, '_', colnames(mat))
  mat

})

#join matrices, filter genes from joint mat and proceed with union
bigmat <- do.call(cbind, matlist)

#get num cells expressing: stored entries per row (bigmat@i is 0-based).
# nbins = nrow(bigmat) keeps the result as long as the matrix has rows;
# otherwise trailing all-zero genes shorten it and the logical index below
# would be recycled, selecting the wrong genes.
num_nonzeros <- tabulate(bigmat@i + 1, nbins = nrow(bigmat))

#get joint filtered genes as those exp by  >= 3 cells
joint_filt_genes <- rownames(bigmat)[num_nonzeros >= 3]

# filter by these
bigmat <- bigmat[rownames(bigmat) %in% joint_filt_genes,]

# filter cells and match order to seurat
bigmat <- bigmat[,match(colnames(sobjint), colnames(bigmat))]
colnames(bigmat) <- colnames(sobjint)

#add as RNA assay, normalize it, then switch the default back to RISC
rnaassay <- Seurat::CreateAssayObject(counts = bigmat)
sobjint[['RNA']] <- rnaassay
DefaultAssay(sobjint) <- 'RNA'
sobjint <- NormalizeData(sobjint, assay = 'RNA')
DefaultAssay(sobjint) <- 'RISC'

#add in expm1 risc values to counts slot of risc assay
sobjint@assays$RISC@counts <- expm1(sobjint@assays$RISC@data)

#unlink temp concat mat
unlink( paste0(outdir_int_objects, '/.concatmatrix.rds') )

#clean env: remove first, then collect, so the freed memory is released
rm(bigmat, joint_filt_genes, num_nonzeros, matlist)
invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))




#factorize sample column so its levels follow the order of sample_metadata
sobjint$Code <- factor(sobjint$Code, levels = sample_metadata$Code)

#factorize condition column so its levels follow sample_metadata$Condition
# NOTE(review): relies on sample_metadata$Condition being a factor; levels()
# of a plain character column is NULL - confirm upstream
sobjint$Condition <- factor(sobjint$Condition, levels = levels(sample_metadata$Condition))


#update saved sobjint with predictions assay
saveRDS(sobjint,  paste0(outdir_int_objects, '/Seurat-object_integrated.rds') )


## clean up env
# predictionmats and intmd are normally removed earlier already; only remove
# what still exists so rm() does not warn about missing objects
rm(list = intersect(c('predictionmats', 'intmd'), ls()))

invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))

7.1 Selection of reference sample for integration

RISC requires the selection of a reference sample from among the dataset. The optimal reference sample is one that has the most diversity of celltypes.

# Report text explaining how to read the InPlot; it is written to the
# document verbatim with cat().
plotlab <- '
One way to select the reference sample is manual inspection of the "InPlot". This plot consists of three panels. The top panel is the most important, and shows the number of clusters detected in each sample across a range of different PC values. Essentially, this is a proxy of the number of cell types detected in the dataset, and the sample with most clusters should be selected as the reference. The second panel allows selection of the optimal PC value to use and the amount of variance each PC has for each sample, with good samples having a higher score (indicating more diversity). The third panel indicates bias of the gene signatures, samples with high scores here should not be selected as the reference.



'

cat(plotlab)

One way to select the reference sample is manual inspection of the “InPlot”. This plot consists of three panels. The top panel is the most important, and shows the number of clusters detected in each sample across a range of different PC values. Essentially, this is a proxy of the number of cell types detected in the dataset, and the sample with most clusters should be selected as the reference. The second panel allows selection of the optimal PC value to use and the amount of variance each PC has for each sample, with good samples having a higher score (indicating more diversity). The third panel indicates bias of the gene signatures, samples with high scores here should not be selected as the reference.

# show the InPlot stored in `ip`
print(ip)

# Report section describing the automated reference selection. The template
# has a single %s placeholder, filled by sprintf() with names(maxautoscore),
# i.e. the name of the top-scoring sample.
# NOTE(review): the template text contains typos ("number or clusters",
# "weighing", "sampe"); correcting them changes the rendered report text.
plotlab <- '

## Automated RISC reference selection

We have implemented an automated reference selection algorithm, extending the intuition laid out above. This is based on two metrics: 1) the number of clusters, and 2) the cluster-moderated sample variance. 

It is not enough to pick the sample with the most clusters, because sometimes samples have ties in the number of clusters, and cluster number alone can be proportional to number of cells. Thus, we weight the number or clusters by a metric of diversity in the sample which we call the cluster-moderated sample variance.

The cluster-moderated sample variance is calculated first by clustering each sample (ie, using the Seurat Louvain clusters described above), then averaging the sample at the cluster level. Then, the variance of each cluster is calculated, and finally, the mean of the cluster-wise variance is taken. 

Then, to calculate the reference selection score, we multiply the number of clusters per sample, times a relative weighing metric based on the number of cells in each sample relative to the sample with the most cells in the dataset (biggest gets a one, sampe with half cells of biggest gets a 0.5); then we multiply this value by the cluster-moderated variance described above.


We show here the resulting reference scores. The top score is selected as the reference. Please note that the autoselection is experimental, so if the score and selected reference deviates from InPlot, we recommend re-running the pipeline and manually selecting the optimal reference.

In this dataset, sample **%s** would be selected as the reference.


'

cat(sprintf(plotlab, names(maxautoscore)))

7.2 Automated RISC reference selection

We have implemented an automated reference selection algorithm, extending the intuition laid out above. This is based on two metrics: 1) the number of clusters, and 2) the cluster-moderated sample variance.

It is not enough to pick the sample with the most clusters, because sometimes samples have ties in the number of clusters, and cluster number alone can be proportional to number of cells. Thus, we weight the number of clusters by a metric of diversity in the sample which we call the cluster-moderated sample variance.

The cluster-moderated sample variance is calculated first by clustering each sample (ie, using the Seurat Louvain clusters described above), then averaging the sample at the cluster level. Then, the variance of each cluster is calculated, and finally, the mean of the cluster-wise variance is taken.

Then, to calculate the reference selection score, we multiply the number of clusters per sample by a relative weighting metric based on the number of cells in each sample relative to the sample with the most cells in the dataset (the biggest sample gets a 1, a sample with half as many cells as the biggest gets a 0.5); then we multiply this value by the cluster-moderated variance described above.

We show here the resulting reference scores. The top score is selected as the reference. Please note that the autoselection is experimental, so if the score and selected reference deviates from InPlot, we recommend re-running the pipeline and manually selecting the optimal reference.

In this dataset, sample Healthy_2 would be selected as the reference.

# Build the display table of reference-selection scores: one row per sample,
# with the auto-selected reference flagged by an asterisk.
refscoredf <- data.frame('RefScore' = refscore)

# empty marker everywhere except on the row indexed by maxautoscore
refscoredf$Max <- ''
refscoredf[maxautoscore, 'Max'] <- '*'

# keep five decimals for display
refscoredf$RefScore <- round(refscoredf$RefScore, 5)

# sequential "Set-<i>" label, following the row order of refscore
refscoredf$Set <- paste0("Set-", seq_len(nrow(refscoredf)))

# sample codes move from the row names into a regular column
refscoredf$Code <- rownames(refscoredf)
rownames(refscoredf) <- NULL

# final column order
refscoredf <- refscoredf[, c('Code', 'Set', 'RefScore', 'Max')]

knitr::kable(refscoredf)
Code Set RefScore Max
Healthy_1 Set-1 8.21950
Healthy_2 Set-2 9.31144 *
Covid_Mild_1 Set-3 6.81909
Covid_Mild_2 Set-4 5.03081
Covid_Critical_1 Set-5 5.24910
Covid_Critical_2 Set-6 7.78851
# When a reference sample was chosen manually, emit a report section saying so.
# The sample name is substituted into the heading and twice into the sentence.
if (!is.null(risc_reference)) {
  
  plotlab <- '
## Manual reference selection: %s

In this analysis, sample %s was manually selected as the reference sample. Sample %s will be used as the reference, regardless of the auto-selection described above.

'
  
  cat(sprintf(plotlab, risc_reference, risc_reference, risc_reference))
  
}
# Output directory for the integration summary plots
outdir_int_plots <- paste0(outdir_int, '/integration_summaryplots/')
dir.create(outdir_int_plots)


# Reference-selection PDF: the `ip` plot first, then the score table.
# The table title states whether the reference was chosen manually or
# by the auto-selection.
pdf(paste0(outdir_int_plots, 'RISC_reference_selection.pdf'), height = 8, width = 6)

print(ip)

print(
  pdftable(
    refscoredf,
    title = if (is.null(risc_reference)) {
      paste0('RISC reference\nautoselection')
    } else {
      paste0('Manual reference selected\nSample: ', risc_reference)
    }
  )
)

dev.off()


## integrated metadata, reused further down for the alluvial plots
mdint <- sobjint@meta.data


## UMAPs of the integrated object

# coloured by integrated cluster (metadata column named by risc_clust_lab)
clustname <- risc_clust_lab
d1_int <- DimPlot(sobjint, group.by = clustname, label = TRUE, repel = TRUE)

# coloured by sample, one panel per condition, each panel boxed
d2_int <- DimPlot(sobjint, group.by = 'Code', split.by = 'Condition', ncol = 2) +
  theme(panel.border = element_rect(colour = "black", fill = NA, size = 1.1))

# coloured by condition
dcond <- DimPlot(sobjint, group.by = 'Condition')

# coloured by integrated cluster, one panel per condition, each panel boxed
dcond_split <- DimPlot(
  sobjint,
  group.by = clustname,
  split.by = 'Condition',
  label = TRUE,
  repel = TRUE,
  ncol = 2
) +
  theme(panel.border = element_rect(colour = "black", fill = NA, size = 1.1))

# Marker heatmap, step 1: pick the genes and scale them on the RISC assay

DefaultAssay(sobjint) <- 'RISC'

# top n markers per cluster, ranked by the `score` column
n <- 5
top <- top_n(group_by(m_integrated_clusters, cluster), n = n, wt = score)

genes <- top$gene

# scale only the selected genes
sobjint <- ScaleData(sobjint, features = genes, verbose = verbose)


# #make sure genes are in 
# if( any( !(genes %in% rownames(sobjint@assays$SCT@scale.data)) ) ){
#   
#   #try getresidual...
#   missinggenes <- genes[!(genes %in% rownames(sobjint@assays$SCT@scale.data))]
#   sobjint <- GetResidual(sobjint, missinggenes, na.rm = F, replace.value = T)
#   
#   #it can be complicated doing this after integration, some genes are NAs... 
#   scgem <- sobjint@assays$SCT@scale.data
#   
#   if( any( !complete.cases(scgem) ) ){
#     scgem <- scgem[complete.cases(scgem),]
#     top <- top[top$gene %in% rownames(scgem),]
#     sobjint@assays$SCT@scale.data <- scgem
#   }
#   rm(scgem)
#   
# }



# Heatmap inputs: scaled RISC expression for the markers present in the object,
# with rows in marker order
top <- top[top$gene %in% rownames(sobjint), ]
gem <- sobjint@assays$RISC@scale.data
gem <- gem[match(top$gene, rownames(gem)), ]

# Column (cell) annotation: sort cells by cluster and reorder the matrix
# columns to follow that order
md <- sobjint@meta.data
md <- md[order(md$seurat_clusters), ]
gem <- gem[, match(rownames(md), colnames(gem))]

clust_bc <- setNames(md$seurat_clusters, nm = colnames(gem))

# one hue per cluster level
col_clust <- setNames(
  scales::hue_pal()(length(levels(sobjint$seurat_clusters))),
  nm = levels(sobjint$seurat_clusters)
)

ha_clust <- ComplexHeatmap::HeatmapAnnotation(
  Cluster = clust_bc,
  col = list(Cluster = col_clust),
  show_legend = FALSE
)


# Row (marker) annotation: keep clusters in their order of first appearance
top$cluster <- factor(top$cluster, levels = unique(top$cluster))
gem <- gem[match(top$gene, rownames(gem)), ]

ct_gene <- setNames(top$cluster, nm = top$gene)

# reuse the cluster colours, restricted to clusters that have markers
col_gene <- col_clust[names(col_clust) %in% top$cluster]

ha_genes <- ComplexHeatmap::rowAnnotation(
  Cluster = ct_gene,
  col = list(Cluster = col_gene),
  show_annotation_name = FALSE
)




# Clamp the scaled expression to [-5, 5] so outliers do not dominate the
# colour scale.
gem[gem > 5] <- 5
# The spaces are required: `gem<-5` parses as the assignment `gem <- 5`,
# so the previous form never clamped the values below -5.
gem[gem < -5] <- -5



# Per-cell marker heatmap: columns split by cluster, rows split by the
# cluster each marker belongs to; no clustering, so the prepared order is kept
hm_int <- ComplexHeatmap::Heatmap(
  gem,
  name = 'Scaled\nExpression',
  # column_title = 'Integrated clusters',
  # rows
  row_split = top$cluster,
  row_names_gp = grid::gpar(fontsize = 5),
  row_title_gp = grid::gpar(fontsize = 5),
  row_title_rot = 0,
  row_gap = unit(0.8, "mm"),
  cluster_rows = FALSE,
  # columns (cell barcodes are blanked out)
  column_split = md$seurat_clusters,
  column_labels = rep('', ncol(gem)),
  column_title_gp = grid::gpar(fontsize = 7),
  column_title_rot = 45,
  column_gap = unit(0.8, "mm"),
  cluster_columns = FALSE,
  # annotations
  top_annotation = ha_clust,
  left_annotation = ha_genes,
  use_raster = FALSE
)




### marker average heatmap ###
# Average the scaled expression matrix over the cells of each cluster,
# producing one column per cluster level.
avgl <- lapply(levels(md$seurat_clusters), function(clust){
  mdc <- md[md$seurat_clusters == clust, ]
  # drop = FALSE keeps a one-cell cluster as a one-column matrix; without it
  # the subset collapses to a vector and rowMeans() fails
  gemc <- gem[, colnames(gem) %in% rownames(mdc), drop = FALSE]
  matrix(rowMeans(gemc),
         dimnames = list(rownames(gem), clust))
})
avg <- do.call('cbind', avgl)

# Column annotation / split factor. Values follow the actual column order of
# `avg`, so each column is always labelled with its own cluster; only the
# levels are sorted numerically, which controls the display order.
clust_bc <- factor(colnames(avg), levels = str_sort(colnames(avg), numeric = TRUE))

col_clust <- setNames(scales::hue_pal()(length(levels(sobjint$seurat_clusters))),
                      nm = levels(sobjint$seurat_clusters))

ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, col = list(Cluster = col_clust), show_legend = FALSE)


hm_int_avg <- ComplexHeatmap::Heatmap(avg,
                                      # column_title = 'Integrated clusters',
                                      row_names_gp = grid::gpar(fontsize = 6),
                                      column_names_gp = grid::gpar(fontsize = 6),
                                      # same factor as the annotation, so split and labels agree
                                      column_split = clust_bc,
                                      column_title_rot = 0,
                                      column_names_rot = 0,

                                      row_split = top$cluster,
                                      row_title_gp = grid::gpar(fontsize = 6),
                                      row_gap = unit(0.5, "mm"),
                                      column_gap = unit(0.5, "mm"),
                                      row_title_rot = 0,
                                      name = 'Scaled\nExpression',
                                      cluster_columns = FALSE,
                                      cluster_rows = FALSE,
                                      top_annotation = ha_clust,
                                      left_annotation = ha_genes,
                                      use_raster = FALSE)




## QC violin plots (added Aug 8): one legend-free panel per quality metric,
## arranged in two columns
int_qc_vln <- wrap_plots(
  ncol = 2,
  lapply(
    c('nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.hemoglobin'),
    function(metric) {
      VlnPlot(sobjint, metric, pt.size = 0.1) + NoLegend()
    }
  )
)









# DefaultAssay(sobjint) <- 'integrated'





### alluvial plot: condition -> integrated cluster

md <- sobjint@meta.data
labelsdf <- md[, c("Condition", clustname)]
ap_cond_to_clust <- alluvialplot(labelsdf)


### alluvial plots: sample -> integrated cluster, one plot per condition

mdint <- sobjint@meta.data
ap_samp_to_clust_list <- lapply(unique(sample_metadata$Condition), function(cond) {
  
  # cells of this condition only; factor levels are deliberately left alone
  # so that colours stay the same across the per-condition plots
  cond_md <- mdint[mdint$Condition == cond, ]
  cond_labels <- cond_md[, c("Code", clustname)]
  
  alluvialplot(cond_labels) + labs(title = cond) + NoLegend()
  
})
names(ap_samp_to_clust_list) <- unique(sample_metadata$Condition)

# combine the per-condition plots into one figure
ap_samp_to_clust <- wrap_plots(ap_samp_to_clust_list)





# For every sample code, build an alluvial plot mapping the sample's own
# clusters (left) to the integrated clusters (right)
mdint <- sobjint@meta.data
indi_int_clust_ap_list <- lapply(sample_metadata$Code, function(code) {
  
  # cells of this sample only
  code_md <- mdint[mdint$Code == code, ]
  
  # metadata columns that hold the individual and the integrated clustering
  indiclustname <- paste0("SCT_snn_res.", res_indi)
  intclustname <- paste0('RISC_Louvain_npc', pcs_int, '_res', res_int)
  code_labels <- code_md[, c(indiclustname, intclustname)]
  
  # order the individual clusters numerically rather than alphabetically
  code_labels[, 1] <- factor(
    code_labels[, 1],
    levels = stringr::str_sort(unique(code_labels[, 1]), numeric = TRUE)
  )
  
  names(code_labels)[1] <- 'Individual Clusters'
  
  scDAPP::alluvialplot(code_labels)
  
})
names(indi_int_clust_ap_list) <- sample_metadata$Code


# Gather every integrated summary plot in one named list; the report
# sections below pull the plots back out of it by name
summaryplots_integrated <- list(
  dcond = dcond,
  d2_int = d2_int,
  d1_int = d1_int,
  dcond_split = dcond_split,
  ap_cond_to_clust = ap_cond_to_clust,
  ap_samp_to_clust = ap_samp_to_clust,
  hm_int = hm_int,
  hm_int_avg = hm_int_avg,
  int_qc_vln = int_qc_vln,
  indi_int_clust_ap_list = indi_int_clust_ap_list
)


### write all of them to one PDF in a subdir of outdir_int
outdir_int_plots <- paste0(outdir_int, '/integration_summaryplots/')

dir.create(outdir_int_plots)

pdf(paste0(outdir_int_plots, '/summaryplots_integrated.pdf'), height = 9, width = 9)

print(summaryplots_integrated)

dev.off()






### cell type plots

if(use_labeltransfer == T){
  
  
  
  # UMAP coloured by the thresholded top cell type call
  # (calculated in individual samples)
  d3_int <- DimPlot(sobjint, label = TRUE, repel = TRUE, group.by = 'top_celltype_thresholded')
  
  # UMAP coloured by the top cell type call score, labelled by cell type.
  # The stray empty trailing argument in the SetIdent() call was removed.
  d4_int <- FeaturePlot(SetIdent(sobjint, value = 'top_celltype_thresholded'),
                        label = TRUE, repel = TRUE,
                        features = 'top_celltype_call_seurat_score')
  
  
  # Heatmap of the prediction scores per cluster.
  # Scaled values are used so rows are comparable between clusters.
  
  # per-cluster averages of the prediction assay
  avgs <- AverageExpression(sobjint, assays = 'predictions', return.seurat = FALSE)
  # drop the last row (the "max" entry)
  avgs <- head(as.data.frame(avgs), -1)
  # drop prediction rows that are all zero
  avgs <- avgs[Matrix::rowSums(avgs) > 0, ]
  # nicer column titles
  colnames(avgs) <- gsub('predictions.', 'cluster_', colnames(avgs))
  # numeric matrix for scaling / plotting
  avgs <- as.matrix(avgs)
  # row-wise scaling emphasizes differences between clusters
  avgs <- t(scale(t(avgs)))
  # midpoint of the colour scale: overall mean of the scaled matrix
  medval <- mean(avgs)
  # The colour mapping is now passed by name (`col =`) instead of relying on
  # positional matching among the named arguments.
  hm_ctscores <- ComplexHeatmap::Heatmap(avgs,
                                         col = circlize::colorRamp2(c(min(avgs), medval, max(avgs)), c("blue", "white", "red")),
                                         name = 'Scaled\nmean prediction\nscores per cluster',
                                         column_title = 'Integrated',
                                         rect_gp = grid::gpar(col = "white", lwd = 0.5))
  
  
  
  # Dotplot companion to the score heatmap: reuse the heatmap's row
  # dendrogram and row order. Extracting them before the heatmap is drawn
  # may raise a warning, which is silenced here (a seed was set upstream).
  dend <- suppressWarnings(row_dend(hm_ctscores))
  
  # cell types in heatmap row order, used by the dotplot and later plots
  ct_ordered <- suppressWarnings(
    rownames(hm_ctscores@matrix)[ComplexHeatmap::row_order(hm_ctscores)]
  )
  
  # dendrogram drawn sideways, without axis labels
  nicedend <- ggdendro::ggdendrogram(rev(dend), rotate = TRUE) +
    scale_y_reverse(expand = c(0.05, 0)) +
    theme(
      axis.text.y = element_blank(),
      axis.text.x = element_blank()
    )
  
  # dotplot of the prediction scores, cell types on the vertical axis
  dp_ctscores <- DotPlot(sobjint, assay = 'predictions', rev(ct_ordered)) +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_text(hjust = 0),
      axis.text.x = element_text(size = 5)
    ) +
    scale_color_gradient2(low = 'blue', high = 'red', mid = 'grey') +
    xlab(label = 'Cluster') +
    guides(color = guide_colorbar(title = "Scaled Average\nPrediction Score"))
  
  # dendrogram on the left, dotplot on the right
  dp_ctscores <- patchwork::wrap_plots(list(nicedend, dp_ctscores), widths = c(0.3, 1))
  
  
  
  # Cell types with a non-zero total prediction score
  # (the last row of the prediction assay is dropped first)
  rs <- head(Matrix::rowSums(sobjint@assays$predictions@data), -1)
  rs <- sort(rs[rs > 0], decreasing = TRUE)
  ct_in <- names(rs)
  
  
  # Reference markers for the heatmap
  m_reference <- readRDS(m_reference_path)
  m_ref_small <- m_reference
  
  # as of Nov 9 2023 (Seurat v5): score = difference in detection rate
  # times the average log2 fold change
  m_ref_small$score <- (m_ref_small$pct.1 - m_ref_small$pct.2) * m_ref_small$avg_log2FC
  
  # replace underscores in the cell type names with dashes, since seurat
  # doesn't like underscores in label transfer feature names
  lt_ref_levs <- gsub('_', '-', levels(m_ref_small$cluster))
  m_ref_small$cluster <- plyr::mapvalues(
    m_ref_small$cluster,
    from = levels(m_ref_small$cluster),
    to = lt_ref_levs
  )
  
  # keep genes present in the integrated object and cell types with a score
  m_ref_small <- m_ref_small[
    m_ref_small$gene %in% rownames(sobjint) & m_ref_small$cluster %in% ct_in,
  ]
  
  # top n markers per cell type, ranked by score
  n <- 5
  top <- top_n(group_by(m_ref_small, cluster), n = n, wt = score)
  
  genes <- top$gene
  
  # scale only the selected genes
  sobjint <- ScaleData(sobjint, features = genes, verbose = verbose)
  
  #make sure genes are in 
  # if( any( !(genes %in% rownames(sobjint@assays$SCT@scale.data)) ) ){
  #   missinggenes <- genes[!(genes %in% rownames(sobjint@assays$SCT@scale.data))]
  #   sobjint <- GetResidual(sobjint, genes)
  # }
  
  # Heatmap inputs: scaled RISC expression of the reference markers,
  # rows in marker order
  top <- top[top$gene %in% rownames(sobjint), ]
  gem <- sobjint@assays$RISC@scale.data
  gem <- gem[match(top$gene, rownames(gem)), ]
  
  # Column (cell) annotation: sort cells by cluster and reorder the matrix
  # columns to follow that order
  md <- sobjint@meta.data
  md <- md[order(md$seurat_clusters), ]
  gem <- gem[, match(rownames(md), colnames(gem))]
  
  clust_bc <- setNames(md$seurat_clusters, nm = colnames(gem))
  
  # one hue per cluster level
  col_clust <- setNames(
    scales::hue_pal()(length(levels(sobjint$seurat_clusters))),
    nm = levels(sobjint$seurat_clusters)
  )
  
  ha_clust <- ComplexHeatmap::HeatmapAnnotation(
    Cluster = clust_bc,
    col = list(Cluster = col_clust),
    show_legend = FALSE
  )
  
  
  # Row (marker) annotation: order genes by cell type, following ct_ordered
  top$cluster <- factor(top$cluster, levels = ct_ordered)
  top <- top[order(top$cluster), ]
  gem <- gem[match(top$gene, rownames(gem)), ]
  
  ct_gene <- setNames(top$cluster, nm = top$gene)
  
  # Set2 palette, interpolated to one colour per cell type
  coul <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))(length(unique(top$cluster)))
  col_gene <- setNames(coul, nm = unique(top$cluster))
  
  ha_genes <- ComplexHeatmap::rowAnnotation(
    Celltype = ct_gene,
    col = list(Celltype = col_gene),
    show_annotation_name = FALSE
  )
  
  
  # Clamp the scaled expression to [-5, 5] so outliers do not dominate the
  # colour scale.
  gem[gem > 5] <- 5
  # The spaces are required: `gem<-5` parses as the assignment `gem <- 5`,
  # so the previous form never clamped the values below -5.
  gem[gem < -5] <- -5
  
  
  # Per-cell heatmap of the reference markers: columns split by integrated
  # cluster, rows split by cell type; no clustering, so the prepared order
  # is kept
  hm_refmarkers <- ComplexHeatmap::Heatmap(
    gem,
    name = 'Scaled\nExpression',
    # column_title = 'Integrated clusters',
    # rows
    row_split = top$cluster,
    row_names_gp = grid::gpar(fontsize = 5),
    row_title_gp = grid::gpar(fontsize = 5),
    row_title_rot = 0,
    row_gap = unit(0.8, "mm"),
    cluster_rows = FALSE,
    # columns (cell barcodes are blanked out)
    column_split = md$seurat_clusters,
    column_labels = rep('', ncol(gem)),
    column_title_gp = grid::gpar(fontsize = 7),
    column_title_rot = 45,
    column_gap = unit(0.8, "mm"),
    cluster_columns = FALSE,
    # annotations
    top_annotation = ha_clust,
    left_annotation = ha_genes,
    use_raster = FALSE
  )
  
  
  
  
  # Average the scaled expression matrix over the cells of each cluster,
  # producing one column per cluster level.
  avgl <- lapply(levels(md$seurat_clusters), function(clust){
    mdc <- md[md$seurat_clusters == clust, ]
    # drop = FALSE keeps a one-cell cluster as a one-column matrix; without
    # it the subset collapses to a vector and rowMeans() fails
    gemc <- gem[, colnames(gem) %in% rownames(mdc), drop = FALSE]
    matrix(rowMeans(gemc),
           dimnames = list(rownames(gem), clust))
  })
  avg <- do.call('cbind', avgl)
  
  # Column annotation / split factor. Values follow the actual column order
  # of `avg`, so each column is always labelled with its own cluster; only
  # the levels are sorted numerically, which controls the display order.
  clust_bc <- factor(colnames(avg), levels = str_sort(colnames(avg), numeric = TRUE))
  
  col_clust <- setNames(scales::hue_pal()(length(levels(sobjint$seurat_clusters))),
                        nm = levels(sobjint$seurat_clusters))
  
  ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, col = list(Cluster = col_clust))
  
  
  hm_refmarkers_avg <- ComplexHeatmap::Heatmap(avg,
                                               # column_title = 'Integrated clusters',
                                               row_names_gp = grid::gpar(fontsize = 6),
                                               column_names_gp = grid::gpar(fontsize = 6),
                                               # same factor as the annotation, so split and labels agree
                                               column_split = clust_bc,
                                               column_title_rot = 0,
                                               column_names_rot = 0,
                                               
                                               row_split = top$cluster,
                                               row_title_gp = grid::gpar(fontsize = 6),
                                               row_gap = unit(0.5, "mm"),
                                               column_gap = unit(0.5, "mm"),
                                               row_title_rot = 0,
                                               name = 'Scaled\nExpression',
                                               cluster_columns = FALSE,
                                               cluster_rows = FALSE,
                                               top_annotation = ha_clust,
                                               left_annotation = ha_genes,
                                               use_raster = FALSE)
  
  
  
  
  
  
  # alluvial plot: integrated clusters -> thresholded top cell type
  labelsdf <- mdint[, c('seurat_clusters', 'top_celltype_thresholded')]
  ap_int_celltypes <- alluvialplot(labelsdf)
  
  
  # The cluster-celltype mapping table (clustmaxdf) is still in memory and
  # is added to the plot list below.
  
  # UMAP coloured by the cluster-level cell type prediction
  d5 <- DimPlot(
    sobjint,
    group.by = 'INT_celltype_cluster_prediction',
    label = TRUE,
    repel = TRUE
  )
  
  
  # Violin plots of each cell type score over the clusters
  
  # cell types in heatmap row order
  ct_ordered <- rownames(hm_ctscores@matrix)[ComplexHeatmap::row_order(hm_ctscores)]
  
  # the scores live in the predictions assay
  DefaultAssay(sobjint) <- 'predictions'
  
  # one single-element list (vct) per cell type
  ctplots_int <- lapply(ct_ordered, function(ct) {
    list(vct = VlnPlot(sobjint, ct, pt.size = 0.1))
  })
  
  names(ctplots_int) <- ct_ordered
  
  # switch back to the integrated assay
  DefaultAssay(sobjint) <- 'RISC'
  
  
  
  
  # Gather the label-transfer plots (and the cluster-celltype mapping table)
  # in one named list, in the order they are written to the PDF
  summaryplots_integrated_ct <- list(
    hm_ctscores = hm_ctscores,
    dp_ctscores = dp_ctscores,
    hm_refmarkers = hm_refmarkers,
    hm_refmarkers_avg = hm_refmarkers_avg,
    d3_int = d3_int,
    d4_int = d4_int,
    ap_int_celltypes = ap_int_celltypes,
    clustmaxdf = clustmaxdf,
    d5 = d5,
    ctplots_int = ctplots_int
  )
  
  
  # Write everything to a single PDF. The mapping table is a data frame,
  # so it is rendered through pdftable(); every other entry is printed as is.
  pdf(paste0(labeltransfer_outdir_int, '/LabelTransferSummaryPlots.pdf'), height = 9, width = 9)
  
  for (i in seq_along(summaryplots_integrated_ct)) {
    
    if (names(summaryplots_integrated_ct)[i] != 'clustmaxdf') {
      print(summaryplots_integrated_ct[[i]])
    } else {
      print(scDAPP::pdftable(summaryplots_integrated_ct[[i]], title = 'Cluster-Celltype Mapping'))
    }
    
  }
  
  dev.off()
  
  
}

8 Integrated cluster summary plots

# Section text (raw markdown) for the condition-coloured UMAP
plotlab <- "


## UMAP colored by condition

Here we plot a UMAP of the integrated dataset colored according to the sample’s condition. Overall we expect to observe a good overlap, while some sections may be quite distinct. This can correspond to clusters of cell states or cell types that are present in one sample but absent from another.


"

cat(plotlab)

8.1 UMAP colored by condition

Here we plot a UMAP of the integrated dataset colored according to the sample’s condition. Overall we expect to observe a good overlap, while some sections may be quite distinct. This can correspond to clusters of cell states or cell types that are present in one sample but absent from another.

# Condition-coloured UMAP
print(summaryplots_integrated[["dcond"]])

# Section text (raw markdown) for the UMAP split by condition
plotlab <- "


## UMAP split by condition and colored by sample

Here, the integrated UMAP is split by condition, meaning that cells from Condition A are separated from cells from condition B. Additionally, cells are colored according to their sample origin. This allows us to check the overlap of biological replicates from the same condition.


"

cat(plotlab)

8.2 UMAP split by condition and colored by sample

Here, the integrated UMAP is split by condition, meaning that cells from Condition A are separated from cells from condition B. Additionally, cells are colored according to their sample origin. This allows us to check the overlap of biological replicates from the same condition.

# UMAP split by condition, coloured by sample
print(summaryplots_integrated[["d2_int"]])

# Section text (raw markdown) for the cluster-coloured UMAP
plotlab <- "


## UMAP colored by integrated clusters

Here we plot the same UMAP but this time colored by the integrated clusters. These clusters are important to characterize, as these are the groups that we will compare one-by-one between conditions.


"

cat(plotlab)

8.3 UMAP colored by integrated clusters

Here we plot the same UMAP but this time colored by the integrated clusters. These clusters are important to characterize, as these are the groups that we will compare one-by-one between conditions.

# Cluster-coloured UMAP
print(summaryplots_integrated[["d1_int"]])

# Section text (raw markdown) for the cluster UMAP split by condition
plotlab <- "


## UMAP split by condition and colored by integrated clusters

Here we plot the UMAP split by conditon and colored by integrated clusters. This is useful to see compositional differences in which clusters may be enriched or depleted between conditions.


"

cat(plotlab)

8.4 UMAP split by condition and colored by integrated clusters

Here we plot the UMAP split by condition and colored by integrated clusters. This is useful to see compositional differences in which clusters may be enriched or depleted between conditions.

# Cluster UMAP split by condition
print(summaryplots_integrated[["dcond_split"]])

# Section text (raw markdown) for the marker heatmap
plotlab <- "


## Heatmap of integrated cluster markers

Here we plot the top 5 markers per cluster for the integrated clusters.


"

cat(plotlab)

8.5 Heatmap of integrated cluster markers

Here we plot the top 5 markers per cluster for the integrated clusters.

# Per-cell marker heatmap
print(summaryplots_integrated[["hm_int"]])

# Section text (raw markdown) for the cluster-averaged marker heatmap
plotlab <- "


## Averaged heatmap of integrated cluster markers

Here we plot the top 5 markers per cluster for the integrated clusters after averaging over each cluster. It is similar to the marker heatmap above, but can help visualize the cluster-wise expression by plotting the cluster averages instead of each individual cell.


"

cat(plotlab)

8.6 Averaged heatmap of integrated cluster markers

Here we plot the top 5 markers per cluster for the integrated clusters after averaging over each cluster. It is similar to the marker heatmap above, but can help visualize the cluster-wise expression by plotting the cluster averages instead of each individual cell.

# Cluster-averaged marker heatmap
print(summaryplots_integrated[["hm_int_avg"]])

# Section text (raw markdown) for the QC violin plots
plotlab <- "


## Violin Plots of quality metrics

Here we plots some quality metrics for each integrated cluster including number of UMIs per cell (nCount_RNA), number of unique genes detected per cell (nFeature_RNA), percent mitochondrial gene expression (percent.mito), and percent hemoglobin gene expression (percent.hemoglobin).


"

cat(plotlab)

8.7 Violin Plots of quality metrics

Here we plot some quality metrics for each integrated cluster, including the number of UMIs per cell (nCount_RNA), the number of unique genes detected per cell (nFeature_RNA), percent mitochondrial gene expression (percent.mito), and percent hemoglobin gene expression (percent.hemoglobin).

# QC violin plots
print(summaryplots_integrated[["int_qc_vln"]])

# Section text (raw markdown) for the condition-to-cluster alluvial plot
plotlab <- "


## Alluvial plot mapping condition to integrated clusters

This alluvial plot allows us to see the contribution of each condition to each cluster. This allows us to visualize compositional differences between conditions clearly.


"

cat(plotlab)

8.8 Alluvial plot mapping condition to integrated clusters

This alluvial plot allows us to see the contribution of each condition to each cluster. This allows us to visualize compositional differences between conditions clearly.

# Condition-to-cluster alluvial plot
print(summaryplots_integrated[["ap_cond_to_clust"]])

# Section text (raw markdown) for the sample-to-cluster alluvial plots
plotlab <- "


## Alluvial plot mapping samples to integrated clusters

This alluvial plot allows us to see the contribution of each sample to each cluster. Combined with the condition-focused alluvial plot above, this allows us to verify compositional patterns across biological replicates.


"

cat(plotlab)

8.9 Alluvial plot mapping samples to integrated clusters

This alluvial plot allows us to see the contribution of each sample to each cluster. Combined with the condition-focused alluvial plot above, this allows us to verify compositional patterns across biological replicates.

# Sample-to-cluster alluvial plots
print(summaryplots_integrated[["ap_samp_to_clust"]])

# Section text (raw markdown) for the per-sample alluvial plots
plotlab <- "


## Per-sample alluvial plots mapping indiviudal sample clusters to integrated clusters

For each sample, we plot an alluvial plot mapping the individual sample clusters with the integrated clusters.


"

cat(plotlab)

8.10 Per-sample alluvial plots mapping individual sample clusters to integrated clusters

For each sample, we plot an alluvial plot mapping the individual sample clusters with the integrated clusters.

# Print one sub-section per sample: a heading carrying the sample name,
# followed by that sample's individual-to-integrated alluvial plot.
indi_int_clust_ap_list <- summaryplots_integrated$indi_int_clust_ap_list



# seq_along() iterates zero times on an empty list, whereas 1:length()
# would run over c(1, 0) and fail on the missing element
for (i in seq_along(indi_int_clust_ap_list)) {
  
  name <- names(indi_int_clust_ap_list)[i]
  
  plotlab <- "

### %s

Individual Clusters (left) mapped to Integrated Clusters (right)
  
"
  
  cat(sprintf(plotlab, name))
  
  print(indi_int_clust_ap_list[[i]])
  
}

8.10.1 Healthy_1

Individual Clusters (left) mapped to Integrated Clusters (right)

8.10.2 Healthy_2

Individual Clusters (left) mapped to Integrated Clusters (right)

8.10.3 Covid_Mild_1

Individual Clusters (left) mapped to Integrated Clusters (right)

8.10.4 Covid_Mild_2

Individual Clusters (left) mapped to Integrated Clusters (right)

8.10.5 Covid_Critical_1

Individual Clusters (left) mapped to Integrated Clusters (right)

8.10.6 Covid_Critical_2

Individual Clusters (left) mapped to Integrated Clusters (right)

# The per-sample plot list is no longer needed as a separate object
rm(indi_int_clust_ap_list)


# print( summaryplots_integrated$indi_int_clust_ap_list )

# Introductory text (raw markdown) of the label-transfer section; only
# emitted when label transfer is enabled
if (use_labeltransfer == TRUE) {
  
  text <- '

# Integrated label transfer summary

Above, we explored the integrated clusters. Here, we review the cell type scores from label transfer. The goal is to understand which cell types make up each cluster.

Each cell is a mixture of cells. Using label transfer, we can get a score for each cell type in the tissue. If a cell is composed of one cell type, it will have a high score for that cell type only. If a cell is composed of a mix of cell types, it may have a moderately high score for two or more cell types.

As mentioned above, each cell may contain a single cell or a mixture of cells and potentially multiple cell types. Thus, assigning a single label to each cell is difficult. Nevertheless, we use the label transfer scores to detect patterns of cell types in each cluster.

Note that we apply a label transfer score cutoff of 0.3. If the cells are below this, they are marked as NA. These may represent cells missing from the reference.

'
  
  cat(text)
  
}

9 Integrated label transfer summary

Above, we explored the integrated clusters. Here, we review the cell type scores from label transfer. The goal is to understand which cell types make up each cluster.

Each cell is a mixture of cells. Using label transfer, we can get a score for each cell type in the tissue. If a cell is composed of one cell type, it will have a high score for that cell type only. If a cell is composed of a mix of cell types, it may have a moderately high score for two or more cell types.

As mentioned above, each cell may contain a single cell or a mixture of cells and potentially multiple cell types. Thus, assigning a single label to each cell is difficult. Nevertheless, we use the label transfer scores to detect patterns of cell types in each cluster.

Note that we apply a label transfer score cutoff of 0.3. If the cells are below this, they are marked as NA. These may represent cells missing from the reference.

if(use_labeltransfer == T){
  
  
  
  
  plotlab <- "


## Heatmap of cell type label transfer scores with integrated clusters

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$hm_ctscores)
  
  
  
  plotlab <- "


## Dotplot of label transfer cell type scores with integrated clusters

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above. However, the dotplot also includes information about how many cells express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A small dot indicates few cells express the score. 


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$dp_ctscores)
  
  
  
  
  
  
  plotlab <- "


## Heatmap of reference cell type markers in integrated data

Here we plot the top 5 markers of cell types as sorted by average log2 fold change, which  are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type, for example a cluster with high T cell score should express high T cell markers.


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$hm_refmarkers)
  
  
  plotlab <- "


## Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$hm_refmarkers_avg)
  
  
  
  
  
  
  
  
  
  
  plotlab <- "


## UMAP of Top Scoring Celltype

The top scoring celltype is plotted on the UMAP.


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$d3_int)
  
  
  plotlab <- "


## UMAP of label transfer prediction score

The label transfer prediction score is plotted on the UMAP. This is a measure of confidence. Low scoring cells may indicate a new cell in the data that was not present in the reference.


"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$d4_int)
  
  
  
  plotlab <- "


## Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster.




"
  
  cat( plotlab )
  
  print(summaryplots_integrated_ct$ap_int_celltypes)
  
  
  
  
  plotlab <- '


## Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset.
Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix "_putative" to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independnetly of the label transfer results.

'
  
  cat( plotlab )
  
  clustmaxprint <- summaryplots_integrated_ct$clustmaxdf
  clustmaxprint$score <- round(clustmaxprint$score, digits = 5)
  print(knitr::kable(clustmaxprint))
  
  
  
  plotlab <- "


## UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.



"
  
  cat( plotlab )
  
  
  print(summaryplots_integrated_ct$d5)
  
  
  
  
  
  #CELLTYPE FEATUREPLOTS
  
  plotlab <- "


## Reference cell types plotted for integrated data for all cell types

Here we plot a summary panel to analyze the label transfer scores for each cell type in detail. To help identify which cluster strongly matches the cell type, the scores are plotted as Violin Plots.


"
  
  cat( plotlab )
  
  
  template_int_celltypes <- "


### %s


"
  
  invisible(
    
    for( i in c(1:length(summaryplots_integrated_ct$ctplots_int)) ){
      
      ct = names(summaryplots_integrated_ct$ctplots_int)[i]
      
      cat(sprintf(template_int_celltypes, ct))
      
      thisct_plots <- summaryplots_integrated_ct$ctplots_int[[i]]
      
      print(thisct_plots$vct)
      
      # invisible(lapply(thisct_plots$ctplots_conds, print))
      
      
      
    }
    
  )
  
  
}

9.1 Heatmap of cell type label transfer scores with integrated clusters

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters.

9.2 Dotplot of label transfer cell type scores with integrated clusters

Here we plot the label transfer scores, which are used to predict cell types from the reference single-cell RNAseq dataset. The values are scaled, allowing easy comparison across clusters. It is very similar to the heatmap above. However, the dotplot also includes information about how many cells express the cell type score. A big dot indicates many cells in that cluster express it at non-zero level. A big gray dot indicates widespread, low score among cells. A big red dot indicates widespread, high score among cells. A small dot indicates few cells express the score.

9.3 Heatmap of reference cell type markers in integrated data

Here we plot the top 5 markers of cell types as sorted by average log2 fold change, which are derived from the reference dataset. Since these are the top specific markers of each cell type in the reference, their expression pattern should match clusters with a high label transfer score for each given cell type, for example a cluster with high T cell score should express high T cell markers.

9.4 Heatmap of reference cell type markers averaged across clusters

Here we plot the top 5 reference cell type markers by average log2 fold change, this time using the average cluster expression. The markers are derived from the reference dataset. Using averages allows us to easily see which clusters highly or lowly express the reference cell type markers.

9.5 UMAP of Top Scoring Celltype

The top scoring celltype is plotted on the UMAP.

9.6 UMAP of label transfer prediction score

The label transfer prediction score is plotted on the UMAP. This is a measure of confidence. Low scoring cells may indicate a new cell in the data that was not present in the reference.

9.7 Alluvial plot mapping between Louvain clusters to top celltypes

This alluvial plot maps cluster labels to top celltype labels for each cell. This can be useful to identify general trends of celltype score per cluster.

9.8 Table showing cluster-celltype mapping

We apply an ad-hoc method to extend the label transfer from individual cells (default) across entire clusters. We do this by taking the average score of each cell type score in the cluster and choosing the max. This can help simplify the analysis and can be less noisy by sharing information across cells, but may also hide intra-cluster variability, such as particular sub-clusters which may correspond to distinct cell sub-types.

We also apply a thresholding approach such that clusters with a max score of less than 0.3 are considered non-transferrable and marked as NA (not applicable). These may reflect clusters of cells which are not in the reference dataset. Additionally, clusters with a score between 0.3 - 0.5 are marked with the suffix “_putative” to indicate the somewhat uncertain status. All cluster markers should be carefully inspected and checked for cell type, independently of the label transfer results.

cluster max score label
1 NK 0.77652 NK
2 B 1.00000 B
3 T 0.99582 T
4 MNP 0.99599 MNP
5 MNP 1.00000 MNP
6 MNP 0.94171 MNP
7 T 0.78191 T

9.9 UMAP of cluster-celltype mapping

As described above, we extend the label transfer prediction from individual cells to whole clusters in order to denoise and simplify the analysis. Here, rather than plotting the individual cell predictions, we plot the cluster-level predictions.

9.10 Reference cell types plotted for integrated data for all cell types

Here we plot a summary panel to analyze the label transfer scores for each cell type in detail. To help identify which cluster strongly matches the cell type, the scores are plotted as Violin Plots.

9.10.1 MNP

9.10.2 T

9.10.3 NK

9.10.4 B

10 Compositional analysis

Here, we test for differences in the abundance of celltypes (clusters) between conditions. This is referred to as compositional analysis.

If we are comparing two conditions such as treated versus untreated or knockout vs wildtype, one biological effect we can try to test for is the difference in the cell type compositional abundance. The idea that tissue cell type compositions can change in response to biological stimuli has been called “polycreodism” and may reflect particular cell type sensitivity to the perturbation (Lappalainen & Greally 2017). For example, it would likely be of interest to note that a knockout or drug completely ablates a certain celltype, reduces another by half, increases another by 25%, and leaves others unchanged. Biologically, these kinds of differences can be explained by various mechanisms, such as differentiation of certain cell types being specifically impaired by a knockout, or specific cell types being particularly sensitive to a drug.

We use the scRNA-seq data to test for differential compositional abundance of each cluster. If replicates are present, we make use of these to test differential abundance across conditions using the “Propeller” method from the R package “speckle” (Phipson et al 2022). This method calculates cell proportions for each sample, applies a variance stabilizing transformation, and then compares proportions across conditions via linear regression. It is similar to pseudobulk differential expression analysis in that it takes into account having multiple biological replicate samples for each condition. Specifically, we use the arcsin square root transformation by setting “transform = asin”. We picked this test and this transformation based on the results of a benchmarking study comparing compositional analysis methods (Simmons 2022). Though Propeller calculates False Discovery Rates (FDR), we define significance using only nominal P value for power reasons.

If replicates are not available, we use an ad-hoc method that relies on the R prop.test() function, which is very similar to the ChiSquare test (see this forum discussion). This works by calculating the proportion of cells in each condition and comparing the proportions. Because there are often thousands of cells being compared, P values can be extremely low. However, this method can result in false-positive conclusions, as the results of 1-vs-1 sample comparisons may not extend to the populations the samples are drawn from. Thus, we advise caution, and interpretation without replicates should be considered as preliminary / pilot data.

# Select the compositional test from the replicate flag: 'propeller' when
# Pseudobulk_mode is on, 'chisq' otherwise. A plain if/else is used because the
# flag is a single value (ifelse() is meant for vectors), and TRUE is spelled
# out because `T` is an ordinary variable that can be reassigned.
compositional_test <- if (Pseudobulk_mode == TRUE) 'propeller' else 'chisq'

# Run the compositional analysis on the integrated object, grouping cells by
# 'seurat_clusters'.
comp_result <- compositional_analysis_module(sobjint = sobjint,
                                             comps = comps,
                                             sample_metadata = sample_metadata,
                                             outdir_int = outdir_int,
                                             grouping_variable = 'seurat_clusters',
                                             compositional_test = compositional_test)


# Keep the two result elements used by the reporting chunks below and drop the
# wrapper list.
composition_comps <- comp_result$composition_comps
globalcomposition <- comp_result$globalcomposition

rm(comp_result)
# Heading and intro text for the global composition section; cat() writes it
# into the report, where it shows up as a section heading.
plotlab <- paste(
  c(
    "",
    "## Global cell numbers and proportions",
    "",
    "Here we show tables of cell numbers and proportions. These can be used to calculate proportion shifts.",
    ""
  ),
  collapse = "\n"
)

cat(plotlab)

10.1 Global cell numbers and proportions

Here we show tables of cell numbers and proportions. These can be used to calculate proportion shifts.

# Heading and intro text for the cell-number table, written out with cat().
plotlab <- paste(
  c(
    "",
    "### Table of cell numbers",
    "",
    "We display a table of cell numbers for each sample and cluster.",
    ""
  ),
  collapse = "\n"
)
cat(plotlab)

10.1.1 Table of cell numbers

We display a table of cell numbers for each sample and cluster.

# Turn the cell-count table into a data frame and prepend its row names as an
# explicit 'Cluster' column, then render it with kable.
cellstab <- as.data.frame.matrix(globalcomposition$cellstab)
cellstab <- cbind(Cluster = rownames(cellstab), cellstab)
knitr::kable(cellstab)
Cluster Healthy_1 Healthy_2 Covid_Mild_1 Covid_Mild_2 Covid_Critical_1 Covid_Critical_2
1 254 196 236 282 83 201
2 199 187 193 98 191 193
3 132 183 155 102 186 158
4 111 141 148 170 180 162
5 47 17 12 8 0 3
6 7 17 9 6 2 2
7 0 1 3 1 7 25
# Heading and intro text for the cell-proportion table, written out with cat().
plotlab <- paste(
  c(
    "",
    "### Table of cell proportions",
    "",
    "We display a table of cell proportions for each sample and cluster. This is calculated by dividing each column by the sample total. We then round to the third digit for display purposes (though the full proportion table is saved in outputs).",
    ""
  ),
  collapse = "\n"
)
cat(plotlab)

10.1.2 Table of cell proportions

We display a table of cell proportions for each sample and cluster. This is calculated by dividing each column by the sample total. We then round to the third digit for display purposes (though the full proportion table is saved in outputs).

# Proportion table as a data frame; the printed copy is rounded to 3 digits and
# gets the row names prepended as an explicit 'Cluster' column.
proptab <- as.data.frame.matrix(globalcomposition$proptab)
printproptab <- cbind(Cluster = rownames(proptab), round(proptab, digits = 3))

knitr::kable(printproptab)
Cluster Healthy_1 Healthy_2 Covid_Mild_1 Covid_Mild_2 Covid_Critical_1 Covid_Critical_2
1 0.339 0.264 0.312 0.423 0.128 0.270
2 0.265 0.252 0.255 0.147 0.294 0.259
3 0.176 0.247 0.205 0.153 0.287 0.212
4 0.148 0.190 0.196 0.255 0.277 0.218
5 0.063 0.023 0.016 0.012 0.000 0.004
6 0.009 0.023 0.012 0.009 0.003 0.003
7 0.000 0.001 0.004 0.001 0.011 0.034
# Heading and intro text for the per-sample proportion heatmap, written out
# with cat().
plotlab <- paste(
  c(
    "",
    "### Heatmap of cell proportions in each sample",
    "",
    "Here we display a heatmap-style table of cell proportions for each sample and cluster. It is similar to the table above, but we have turned on hierarchical clustering of the rows, to try to visualize patterns of abundance among the clusters.",
    ""
  ),
  collapse = "\n"
)
cat(plotlab)

10.1.3 Heatmap of cell proportions in each sample

Here we display a heatmap-style table of cell proportions for each sample and cluster. It is similar to the table above, but we have turned on hierarchical clustering of the rows, to try to visualize patterns of abundance among the clusters.

# Print the `hmprop` element of `globalcomposition` (presumably the
# cell-proportion heatmap built by the compositional analysis module - confirm
# against that module).
print(globalcomposition$hmprop)

# Heading and intro text for the differential-abundance section, written out
# with cat().
plotlab <- paste(
  c(
    "",
    "## Differential abundance compositional analysis across conditions",
    "",
    "Here we summarize differential abundance across conditions as defined in the cross-condition comparisons provided.",
    ""
  ),
  collapse = "\n"
)

cat(plotlab)

10.2 Differential abundance compositional analysis across conditions

Here we summarize differential abundance across conditions as defined in the cross-condition comparisons provided.

# One-sentence note stating which compositional test was applied, chosen by the
# replicate flag and written out with cat().
plotlab <- if (Pseudobulk_mode == TRUE) {
  "\nReplicates were provided, so we run the propeller test to take into account abundance in each replicate in the cross-condition comparison.\n"
} else {
  "\nThis analysis was run with the prop.test() function in R to compare overall cell proportions between conditions.\n"
}

cat(plotlab)

Replicates were provided, so we run the propeller test to take into account abundance in each replicate in the cross-condition comparison.

# Report the differential-abundance results: for every cross-condition
# comparison in `comps`, emit a sub-heading, the heatmap and the results table
# stored under that comparison's label in `composition_comps`.

# seq_len() gives an empty sequence when `comps` has no rows; 1:nrow(comps)
# would give c(1, 0) and the loop would run on invalid indices.
compslen <- seq_len(nrow(comps))
compidx <- 1 #for testing
#prep names
comps$labels <- paste0(comps$c1, '_vs_', comps$c2)

#get testused
# Must follow Pseudobulk_mode: this was previously hard-coded to always say
# 'propeller', which mislabelled the prop.test() results in the report text.
# It does not change between comparisons, so it is computed once.
comptestused <- if (Pseudobulk_mode == TRUE) 'propeller' else 'R prop.test()'

for(compidx in compslen){
  
  
  #get comparison condition levels
  # (not used below, but kept because loop variables remain visible to later chunks)
  c1 <- comps$c1[compidx]
  c2 <- comps$c2[compidx]
  
  #get comp lab
  # read by name so it does not depend on `labels` being the third column
  lab <- comps$labels[compidx]
  
  #get comp analysis
  diffcomp <- composition_comps[[lab]]
  
  
  
  plotlab <- '
### %s
'
  cat(sprintf(plotlab, lab))
  
  
  ## print heatmap
  
  plotlab <- '
#### Heatmap of compositional analysis results

Here we plot a heatmap of the cell proportions and compositional analysis from the %s analysis.

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05.

'
  cat(sprintf(plotlab, comptestused))
  print(diffcomp$hmprop_comp)
  
  
  
  
  ## print table
  
  plotlab <- '

#### Table of compositional analysis results

Here we plot a table of the cell proportions and compositional analysis from the %s analysis. We round to the 3rd digit for display, which can result in low P values being shown as zeros (though the full analysis is saved in the output folder).

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05. FDR is not used to define significance but is provided for convenience.

'
  
  cat(sprintf(plotlab, comptestused))
  print_compres <- diffcomp$compres
  # every column except the first (the cluster label) is rounded for display
  print_compres[,-1] <- round(print_compres[,-1], 3)
  print(knitr::kable(print_compres))
  
}

10.2.1 Covid_Critical_vs_Healthy

10.2.1.1 Heatmap of compositional analysis results

Here we plot a heatmap of the cell proportions and compositional analysis from the propeller analysis.

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05.

10.2.1.2 Table of compositional analysis results

Here we plot a table of the cell proportions and compositional analysis from the propeller analysis. We round to the 3rd digit for display, which can result in low P values being shown as zeros (though the full analysis is saved in the output folder).

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05. FDR is not used to define significance but is provided for convenience.

BaselineProp.clusters BaselineProp.Freq PropMean.Covid_Critical PropMean.Healthy PropRatio Tstatistic P.Value FDR
7 * 7 * 0.011 0.022 0.001 32.936 2.231 0.043 0.104
4 4 0.206 0.248 0.169 1.465 1.721 0.107 0.188
3 3 0.228 0.249 0.211 1.181 0.809 0.432 0.504
2 2 0.267 0.277 0.259 1.070 0.361 0.723 0.723
1 * 1 * 0.254 0.199 0.301 0.660 -2.204 0.045 0.104
6 6 0.010 0.003 0.016 0.179 -1.253 0.231 0.323
5 * 5 * 0.023 0.002 0.043 0.047 -3.027 0.009 0.063

10.2.2 Covid_Mild_vs_Healthy

10.2.2.1 Heatmap of compositional analysis results

Here we plot a heatmap of the cell proportions and compositional analysis from the propeller analysis.

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05.

10.2.2.2 Table of compositional analysis results

Here we plot a table of the cell proportions and compositional analysis from the propeller analysis. We round to the 3rd digit for display, which can result in low P values being shown as zeros (though the full analysis is saved in the output folder).

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05. FDR is not used to define significance but is provided for convenience.

BaselineProp.clusters BaselineProp.Freq PropMean.Covid_Mild PropMean.Healthy PropRatio Tstatistic P.Value FDR
7 7 0.002 0.003 0.001 4.057 0.630 0.539 0.629
4 4 0.196 0.225 0.169 1.333 1.371 0.192 0.345
1 1 0.332 0.367 0.301 1.219 1.353 0.197 0.345
3 3 0.196 0.179 0.211 0.847 -0.784 0.446 0.625
2 2 0.232 0.201 0.259 0.777 -1.394 0.185 0.345
6 6 0.013 0.010 0.016 0.648 -0.430 0.674 0.674
5 5 0.029 0.014 0.043 0.326 -1.635 0.124 0.345

10.2.3 Covid_Critical_vs_Covid_Mild

10.2.3.1 Heatmap of compositional analysis results

Here we plot a heatmap of the cell proportions and compositional analysis from the propeller analysis.

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05.

10.2.3.2 Table of compositional analysis results

Here we plot a table of the cell proportions and compositional analysis from the propeller analysis. We round to the 3rd digit for display, which can result in low P values being shown as zeros (though the full analysis is saved in the output folder).

Clusters with significant compositional differences are labelled with two asterisks (ie * 1 *). Significance is defined as P < 0.05. FDR is not used to define significance but is provided for convenience.

BaselineProp.clusters BaselineProp.Freq PropMean.Covid_Critical PropMean.Covid_Mild PropRatio Tstatistic P.Value FDR
7 7 0.013 0.022 0.003 8.119 2.168 0.089 0.208
3 3 0.213 0.249 0.179 1.394 1.752 0.147 0.243
2 2 0.240 0.277 0.201 1.377 1.602 0.177 0.243
4 4 0.234 0.248 0.225 1.099 0.565 0.599 0.599
1 1 0.285 0.199 0.367 0.542 -2.460 0.063 0.208
6 6 0.007 0.003 0.010 0.276 -1.471 0.208 0.243
5 5 0.008 0.002 0.014 0.145 -2.195 0.086 0.208
### clean up env
# Remove the compositional-analysis objects that are no longer needed, then run
# a garbage collection; its summary is suppressed with invisible().
rm(list = c("print_compres", "diffcomp", "composition_comps", "globalcomposition"))
invisible(gc(verbose = FALSE, reset = FALSE, full = TRUE))

11 Differential expression across conditions

Below we summarize the results of cross-condition differential expression (DE) analysis, for example KO vs WT, or disease vs healthy, or drugged vs control. As in single-cell data, we stratify the cross-condition DE by cluster. In other words, using the clusters defined above, we compare KO vs WT in cluster 1, cluster 2, and so on. For these types of analysis, it is recommended to have multiple biological replicates per condition.

If multiple replicates are available, then the best way to take advantage of these is to use a “pseudobulk” strategy. This refers to an approach in which cells from each sample are pooled together, and bulk RNA-seq analysis methods are used to identify DE genes across conditions. Pseudobulk differential expression analysis is done with EdgeR likelihood ratio test, as recommended by Squair et al 2021 Nat Com.

If replicates are not available, it is still possible to compare conditions. This is done using a wilcoxon test approach. Conclusions drawn from the results of this type of analysis are inherently limited as they constitute “N of 1” experiments, but can be considered as preliminary/pilot data.

# Choose the sentence describing which differential expression test was run and
# write it into the report. A single if/else (instead of two independent `if`
# statements) guarantees `detestused_text` is assigned on every path and matches
# how the other chunks branch on Pseudobulk_mode.
if (Pseudobulk_mode == TRUE) {
  
  detestused_text <- 'The differential expression analysis here used a pseudobulk edgeR approach. First, cells are "pseudobulked", or combined,  at the cluster level by adding up the gene UMI counts in all cells in each cluster for each replicate. Next, EdgeR with the likelihood ratio test is applied.'
  
} else {
  
  detestused_text <- 'The differential expression analysis here used a non-pseudobulk Wilcoxon test approach. The normalized counts of each gene are compared between cells in condition A vs B in each cluster.'
  
}


cat(detestused_text)

The differential expression analysis here used a pseudobulk edgeR approach. First, cells are “pseudobulked”, or combined, at the cluster level by adding up the gene UMI counts in all cells in each cluster for each replicate. Next, EdgeR with the likelihood ratio test is applied.

# Build the "<c1>_vs_<c2>" label for every comparison.
comps$labels <- paste0(comps$c1, '_vs_', comps$c2)

#read sobjlist back in? keep it in?
# will need to optimize memory

# Name of the grouping variable handed to the DE module.
clustname <- risc_clust_lab


# Run the cross-condition DE module. Both modes share every argument except the
# assay/slot, the mode flag and the min.pct cutoff, so a single call is made and
# those four are switched on Pseudobulk_mode. local() keeps the helper flag out
# of the calling environment.
m_bycluster_crosscondition_de_comps <- local({
  
  pseudobulk_on <- Pseudobulk_mode == TRUE
  
  de_across_conditions_module(
    sobjint = sobjint,
    sample_metadata = sample_metadata,
    comps = comps,
    grouping_variable = clustname,
    outdir_int = outdir_int,
    # pseudobulk mode: 'counts' slot of the RNA assay; otherwise 'data' slot of RISC
    assay = if (pseudobulk_on) 'RNA' else 'RISC',
    slot = if (pseudobulk_on) 'counts' else 'data',
    Pseudobulk_mode = pseudobulk_on,
    cluster_prefix = TRUE,
    crossconditionDE_padj_thres = crossconditionDE_padj_thres,
    crossconditionDE_lfc_thres = crossconditionDE_lfc_thres,
    crossconditionDE_min.pct = if (pseudobulk_on) 0.1 else 0
  )
  
})
# Indices of the comparisons to iterate over. seq_len() yields an empty
# sequence when `comps` has no rows, whereas 1:nrow(comps) would give c(1, 0).
compslen <- seq_len(nrow(comps))
compidx <- 1 #for testing


# Output folders for the cross-condition DE results and their plots. The path
# strings are left exactly as before, since they may be reused further down.
outdir_DE <- paste0(outdir_int, '/differentialexpression_crosscondition/')
outdir_DE_plots <- paste0(outdir_DE, '/plots/')
dir.create(outdir_DE_plots, recursive = TRUE)


DEplots_comps <- lapply(compslen, function(compidx){
  
  
  #get comparison condition levels
  c1 <- comps[compidx,1]
  c2 <- comps[compidx,2]
  
  #get comp lab
  lab <- comps[compidx,3]
  
  message(lab)
  
  #get cross conditions res per cluster list
  m_bycluster_crosscondition_de <- m_bycluster_crosscondition_de_comps[[compidx]]
  
  
  #get num DEGs
  crossconditionDE_min.pct <- ifelse(Pseudobulk_mode == T, yes = 0.1, no = 0)
  numdegs <- sapply(m_bycluster_crosscondition_de, function(m){
    
    #normal fdr and padj thresholds
    m <- m[m$FDR < crossconditionDE_padj_thres,, drop=F]
    m <- m[abs(m$logFC) > crossconditionDE_lfc_thres,, drop=F]
    
    #pct thresholds: +FC, pct1 > 0.1; -FC, pct2 > 0.1
    
    upm <- m[m$logFC > 0,,drop=F]
    upm <- upm[upm$pct.1 > crossconditionDE_min.pct,, drop=F]
    
    dnm <- m[m$logFC < 0,,drop=F]
    dnm <- dnm[dnm$pct.2 > crossconditionDE_min.pct,, drop=F]
    
    m <- rbind(upm,dnm)
    
    try( table( factor(sign(m$logFC), levels=c(-1,1)) ) )
  })
  
  numdegs <- t(numdegs)
  colnames(numdegs) <- c(c2, c1)
  
  #make sure all clusters are shown
  # make a fake df and replace fake with real res
  saved_cluster_levels <- paste0('cluster_',levels(sobjint$seurat_clusters))
  numdegs_all <- data.frame(Cluster = saved_cluster_levels,
                            c1 = 0, c2 = 0)
  colnames(numdegs_all) <- c('Cluster', c1, c2)
  rownames(numdegs_all) <- numdegs_all$Cluster
  
  numdegs_all[rownames(numdegs), c1] <- numdegs[,c1]
  numdegs_all[rownames(numdegs), c2] <- numdegs[,c2]
  rownames(numdegs_all) <- NULL
  
  
  
  ### at least one DEG must be present ###
  numdegs <- numdegs[ (numdegs[,1] > 0) | (numdegs[,2] > 0) , ] 
  
  
  #make sure all samples from pmd are in the mlist...
  subpmd <- sample_metadata[sample_metadata$Condition %in% c(c1,c2),]
  
  #for each cluster, get the up/dn DEGs
  i = 3
  clustres <- lapply(1:nrow(numdegs), function(i){
    
    clustlab <- rownames(numdegs)[i]
    m <- m_bycluster_crosscondition_de[[clustlab]]
    
    m <- m[m$FDR < crossconditionDE_padj_thres,, drop=F]
    m <- m[abs(m$logFC) > crossconditionDE_lfc_thres,, drop=F]
    
    crossconditionDE_min.pct <- ifelse(Pseudobulk_mode == T, yes = 0.1, no = 0)
    
    upm <- m[m$logFC > 0,,drop=F]
    upm <- upm[upm$pct.1 > crossconditionDE_min.pct,, drop=F]
    
    dnm <- m[m$logFC < 0,,drop=F]
    dnm <- dnm[dnm$pct.2 > crossconditionDE_min.pct,, drop=F]
    
    m <- rbind(upm,dnm)
    
    
    #make sure all samples are in top
    # only if de pseudobulk
    if(Pseudobulk_mode == T){
      
      if( any(!(subpmd$Code %in% colnames(m))) ){
        missingcodes <- subpmd$Code[!(subpmd$Code %in% colnames(m))]
        missingcodecols <- lapply(missingcodes, function(code){
          missingdf <- data.frame(missing = rep(0, nrow(m)) )
          colnames(missingdf) <- code
          return(missingdf)
        })
        
        missingcodecols <- dplyr::bind_cols(missingcodecols)
        
        
        
        #bind missing cols to results
        m <- cbind(m,missingcodecols)
        
      }
      
      #make sure order of samples is consistent
      m_nosamp <- m[,!(colnames(m) %in% subpmd$Code)]
      msamp <- m[,colnames(m) %in% subpmd$Code,drop=F]
      msamp <- msamp[, match(subpmd$Code, colnames(msamp)) ]
      
      m <- cbind(m_nosamp, msamp)
      
      
      
    }
    
    
    m$cluster <- clustlab
    
    return(m)
    
  })
  
  names(clustres) <- rownames(numdegs)
  
  # clustres <- do.call(rbind, clustres)
  
  
  
  
  
  
  
  
  
  
  
  # ### summary heatmap ###
  # 
  # # first make a summary heatmap, top 3 DEGs from each cluster up/dn...
  # 
  # top <- lapply(clustres, function(m){
  #   top <- head( m[m$logFC > 0,,drop=F] , n = 3 ) 
  #   btm <- tail( m[m$logFC < 0,,drop=F] , n = 3 )
  #   
  #   subm <- rbind(top,btm)
  #   
  #   return(subm)
  # })
  # 
  # top <- do.call(rbind, top)
  # rownames( top ) <- NULL
  # 
  # 
  # #to make things easier, remove "cluster_"
  # top$cluster <- gsub('cluster_', '', top$cluster)
  # 
  # 
  # 
  # genes <- top$gene_symbol
  # 
  # 
  # #subset for just these conditions
  # md <- sobjint@meta.data
  # md <- md[md$Condition %in% c(c1,c2),]
  # sobjsub <- sobjint[,rownames(md)]
  # 
  # #scale the relevant genes
  # # sobjsub <- ScaleData(sobjsub, features = genes, verbose = verbose)
  # 
  # 
  # # get the gene matrix
  # # gem <- sobjsub@assays$RISC@scale.data
  # gem <- sobjsub@assays$RISC@data
  # 
  # 
  # #prep heatmap
  # top <- top[top$gene %in% rownames(sobjsub),]
  # gem <- gem[match(genes, rownames(gem)),]
  # 
  # #as matrix
  # gem <- as.matrix(gem)
  # 
  # 
  # #annot for clusters
  # 
  # 
  # #first order gem by cluster...
  # md <- sobjsub@meta.data
  # md <- md[order(md$seurat_clusters),]
  # 
  # #within cluster, order by condition...
  # mdx <- lapply( unique(md$seurat_clusters) , function(clust){
  #   mdsub <- md[md$seurat_clusters==clust,]
  #   
  #   mdc1 <- mdsub[mdsub$Condition == c1,,drop=F]
  #   mdc2 <- mdsub[mdsub$Condition == c2,,drop=F]
  #   
  #   mdsub <- rbind(mdc1,mdc2)
  #   return(mdsub)
  #   
  # })
  # md <- do.call(rbind, mdx)
  # 
  # #match gem order too
  # gem <- gem[,match(rownames(md), colnames(gem))]
  # 
  # #cluster annot and colors
  # clust_bc <- setNames(md$seurat_clusters,
  #                      nm = colnames(gem)
  # )
  # col_clust <- setNames(scales::hue_pal()(length(levels(sobjsub$seurat_clusters))),
  #                       nm = levels(sobjsub$seurat_clusters))
  # 
  # #condition annot and colors
  # cond_bc <- setNames(md$Condition,
  #                     nm = colnames(gem))
  # 
  # #for cond colors, set up color scheme...
  # set2 <- c("#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3")
  # 
  # 
  # set2 <- set2[1:length(unique(sample_metadata$Condition))]
  # 
  # col_cond <- setNames(set2,
  #                      nm = unique(sample_metadata$Condition) )
  # 
  # col_cond <- col_cond[names(col_cond) %in% c(c1,c2)]
  # 
  # 
  # 
  # ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = clust_bc, 
  #                                               Condition = cond_bc,
  #                                               
  #                                               col = list(Cluster = col_clust,
  #                                                          Condition = col_cond),
  #                                               show_legend = F)
  # 
  # 
  # 
  # #annot for markers
  # #set genes according to ct_ordered
  # top$cluster <- factor(top$cluster, levels=unique(top$cluster))
  # 
  # #condition; can just use +/- lfc
  # top$Condition <- c1
  # top[top$logFC < 0, 'Condition'] <- c2
  # 
  # 
  # #match order of genes of gem with stats res
  # gem <- gem[match(top$gene, rownames(gem)),]
  # 
  # 
  # #set up annot for cluster for genes
  # ct_gene <- setNames(top$cluster,
  #                     nm=top$gene_symbol)
  # col_gene <- col_clust
  # # col_gene <- col_gene[names(col_gene) %in% top$cluster]
  # 
  # #set up annot for condition for genes
  # cond_gene <- setNames(top$Condition,
  #                       nm = top$gene_symbol)
  # col_genecond <- col_cond
  # 
  # 
  # ha_genes <- ComplexHeatmap::rowAnnotation(Cluster = ct_gene,
  #                                           Condition = cond_gene,
  #                                           col = list(Cluster = col_gene, Condition = col_genecond),
  #                                           show_annotation_name=F)
  # 
  # 
  # #restrict range
  # # gem[gem>5] <- 5
  # # gem[gem<-5] <- -5
  # gem <- log1p(gem)
  # 
  # pal <- circlize::colorRamp2(c(0,  max(gem)), c("white", "red"))
  # 
  # #actual heatmap
  # hm_DE <- ComplexHeatmap::Heatmap(gem,
  #                                  # column_title = 'Integrated clusters',
  #                                  column_labels = rep('', ncol(gem)),
  #                                  row_names_gp = grid::gpar(fontsize = 5),
  #                                  column_split = md$seurat_clusters,
  #                                  row_split = top$cluster,
  #                                  row_title_gp = grid::gpar(fontsize = 5),
  #                                  row_gap = unit(0.8, "mm"), 
  #                                  column_gap = unit(0.8, "mm"),
  #                                  border_gp = gpar(col = "black", lwd = 0.1),
  #                                  row_title_rot = 0,
  #                                  column_title_rot = 45,
  #                                  column_title_gp = grid::gpar(fontsize = 7), 
  #                                  col = pal,
  #                                  name = 'Log1p\nRISC Norm\nCounts',
  #                                  cluster_columns = F,
  #                                  cluster_rows = F,
  #                                  top_annotation = ha_clust,
  #                                  left_annotation = ha_genes,
  #                                  use_raster = F)
  # 
  # 
  # 
  # 
  # 
  # 
  # 
  # 
  # 
  # ### same thing but avg
  # # ie, avg clust 1 c1, avg clust 1 c2, avg clust 2 c1, avg clust 2 c2, etc...
  # 
  # 
  # md <- sobjsub@meta.data
  # md$condclust <- paste0( md$seurat_clusters, '_c1')
  # md[md$Condition == c2,'condclust'] <- paste0( md[md$Condition == c2,'seurat_clusters'], '_c2')
  # oldlevs <- str_sort(unique(md$condclust), numeric = T)
  # levs <- oldlevs
  # levs <- gsub('c1', c1, levs)
  # levs <- gsub('c2', c2, levs)
  # md$condclust <- factor(md$condclust, levels = oldlevs)
  # md$condclust <- plyr::mapvalues(md$condclust, oldlevs, levs)
  # 
  # 
  # sobjsub$condclust <- md$condclust
  # avg <- AverageExpression(sobjsub, assays = 'RISC', slot = 'data', 
  #                          features = rownames(gem),
  #                          group.by = 'condclust')
  # 
  # 
  # avg <- as.matrix(avg[[1]])
  # 
  # ## set up avg heatmap, annots etc ##
  # 
  # #column annots
  # #cluster annot and colors
  # AVGclust_bc <- setNames(str_split_fixed(colnames(avg), '_', 2)[,1],
  #                         nm = colnames(avg)
  # )
  # AVGcol_clust <- setNames(scales::hue_pal()(length(levels(sobjsub$seurat_clusters))),
  #                          nm = levels(sobjsub$seurat_clusters))
  # 
  # #condition annot and colors
  # AVGcond_bc <- setNames(str_split_fixed(colnames(avg), '_', 2)[,2],
  #                        nm = colnames(avg))
  # 
  # #for cond colors, set up color scheme...
  # set2 <- c("#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#E5C494", "#B3B3B3")
  # 
  # set2 <- set2[1:length(unique(sample_metadata$Condition))]
  # 
  # col_cond <- setNames(set2,
  #                      nm = unique(sample_metadata$Condition) )
  # 
  # col_cond <- col_cond[names(col_cond) %in% c(c1,c2)]
  # 
  # 
  # 
  # ha_clust <- ComplexHeatmap::HeatmapAnnotation(Cluster = AVGclust_bc, 
  #                                               Condition = AVGcond_bc,
  #                                               
  #                                               col = list(Cluster = AVGcol_clust,
  #                                                          Condition = col_cond),
  #                                               show_legend = F)
  # 
  # 
  # 
  # 
  # #annot for markers
  # #set genes according to ct_ordered
  # # top$cluster <- factor(top$cluster, levels=unique(top$cluster))
  # 
  # #condition; can just use +/- lfc
  # # top$Condition <- c1
  # # top[top$logFC < 0, 'Condition'] <- c2
  # 
  # 
  # #match order of genes of gem with stats res
  # avg <- avg[match(top$gene, rownames(avg)),]
  # 
  # 
  # # #set up annot for cluster for genes
  # # ct_gene <- setNames(top$cluster,
  # #                     nm=top$gene_symbol)
  # # col_gene <- col_clust
  # # # col_gene <- col_gene[names(col_gene) %in% top$cluster]
  # # 
  # # #set up annot for condition for genes
  # # cond_gene <- setNames(top$Condition,
  # #                       nm = top$gene_symbol)
  # # col_genecond <- col_cond
  # # 
  # # 
  # # ha_genes <- ComplexHeatmap::rowAnnotation(Cluster = ct_gene,
  # #                                           Condition = cond_gene,
  # #                                           col = list(Cluster = col_gene, Condition = col_genecond),
  # #                                           show_annotation_name=F)
  # 
  # 
  # #restrict range
  # # gem[gem>5] <- 5
  # # gem[gem<-5] <- -5
  # 
  # avg <- log1p(avg)
  # 
  # pal <- circlize::colorRamp2(c(0,  max(avg)), c("white", "red"))
  # 
  # #make column order not alphabetic...
  # colsplit <- colnames(avg)
  # colsplit <- as.character(colsplit)
  # colsplit <- str_split_fixed(colsplit, '_', 2)[,1]
  # colsplit <- factor( colsplit,
  #                     levels = str_sort(unique(colsplit), numeric = T))
  # 
  # #actual heatmap
  # hm_DE_AVG <- ComplexHeatmap::Heatmap(avg,
  #                                      # column_title = 'Integrated clusters',
  #                                      # column_labels = rep('', ncol(gem)),
  #                                      column_names_gp = grid::gpar(fontsize = 5),
  #                                      column_names_rot = 45,
  #                                      row_names_gp = grid::gpar(fontsize = 5),
  #                                      column_split = colsplit,
  #                                      row_split = top$cluster,
  #                                      row_title_gp = grid::gpar(fontsize = 5),
  #                                      row_gap = unit(0.8, "mm"), 
  #                                      column_gap = unit(0.8, "mm"),
  #                                      border_gp = gpar(col = "black", lwd = 0.1),
  #                                      row_title_rot = 0,
  #                                      column_title_rot = 45,
  #                                      column_title_gp = grid::gpar(fontsize = 7), 
  #                                      col = pal,
  #                                      name = 'Log1p\nRISC Norm\nCounts',
  #                                      cluster_columns = F,
  #                                      cluster_rows = F,
  #                                      top_annotation = ha_clust,
  #                                      left_annotation = ha_genes,
  #                                      use_raster = F)
  # 
  # 
  # 
  # 
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  ### for each cluster, make heatmaps of its DEGs: one using single-cell data,
  ### and one at the sample level (pseudobulk values in pseudobulk mode,
  ### otherwise per-sample mean RISC values)
  # Returns one list per cluster with two drawn heatmaps:
  #   hm_de_clust    - row-scaled RISC values, one column per cell
  #   hm_de_clust_pb - row-scaled sample-level values, one column per sample Code
  # Reads clustres, sobjint, sample_metadata, c1, c2, lab and Pseudobulk_mode
  # from the enclosing scope.
  # seq_along() (rather than 1:length()) keeps this a no-op for an empty clustres.
  cluster_de_heatmap_l <- lapply(seq_along(clustres), function(i){
    
    # draw a heatmap on a throwaway null device so nothing is rendered yet;
    # on.exit() closes that device even when draw() fails
    draw_offscreen <- function(hm){
      pdf(NULL)
      on.exit(dev.off(), add = TRUE)
      draw(hm, 
           column_title = paste0(lab, '\n', str_to_title(cluster) ),
           row_title = 'Log Fold Change Sign')
    }
    
    # scale each row (gene) and restrict the range to [-5, 5]
    scale_and_clip <- function(x){
      x <- t(scale(t(as.matrix(x))))
      x[x > 5 ] <- 5
      x[x < -5] <- -5
      x
    }
    
    # shared heatmap layout; rows are split by logFC sign (positive first)
    build_heatmap <- function(x, name, column_split, show_column_names){
      Heatmap(x, 
              show_column_names = show_column_names, 
              cluster_column_slices = FALSE, cluster_columns = FALSE,
              cluster_row_slices = FALSE, cluster_rows = FALSE,
              show_row_names = show_genes, 
              name = name, 
              column_split = column_split, 
              row_split = lfc_sign,
              border_gp = gpar(col = "black", lwd = 1)
      )
    }
    
    
    #get DEGs, sorted from highest to lowest logFC
    m <- clustres[[i]]
    m <- m[order(m$logFC, decreasing = TRUE),]
    
    #get cluster name, with and without the "cluster_" prefix
    cluster <- names(clustres)[i]
    message(cluster)
    cluster_noprefix <- gsub('cluster_', '', cluster)
    
    #get cluster cells of the two compared conditions; order by c1 then c2
    md <- sobjint@meta.data
    mdclust <- md[md$seurat_clusters == cluster_noprefix,]
    mdclust <- mdclust[mdclust$Condition %in% c(c1,c2),]
    mdclust$Condition <- factor(mdclust$Condition, levels = c(c1,c2))
    mdclust <- mdclust[order(mdclust$Condition), ]
    
    clustcells <- rownames(mdclust)
    
    #get unscaled matrix of DEGs of cells from this cluster
    # kept separately because the per-sample means below need unscaled values
    rawmat <- sobjint@assays$RISC@data
    rawmat <- rawmat[match(m$gene_symbol, rownames(rawmat)),clustcells,drop=FALSE]
    
    #scale matrix and restrict range
    mat <- scale_and_clip(rawmat)
    
    # gene labels are only readable for small DEG lists
    show_genes <- nrow(mat) <= 50
    lfc_sign <- factor(sign(m$logFC), levels = c(1,-1))
    
    hm_de_clust <- draw_offscreen(
      build_heatmap(mat, 
                    name = 'Scaled\nRISC\nValues',
                    column_split = mdclust$Condition,
                    show_column_names = FALSE)
    )
    
    
    #samples of just this comparison, ordered by c1 then c2
    subpmd <- sample_metadata[sample_metadata$Condition %in% c(c1,c2),]
    code_comps_order <- subpmd[subpmd$Condition == c1,"Code"]
    code_comps_order <- c(code_comps_order, subpmd[subpmd$Condition == c2,"Code"])
    
    
    if(Pseudobulk_mode == TRUE){
      ### just plot the per-sample values stored in the DEG table
      # drop = FALSE keeps the column names even if only one sample matches
      pb_mat <- m[,colnames(m) %in% mdclust$Code, drop = FALSE]
      
      pb_mat <- log1p(as.matrix(pb_mat))
      pb_mat <- scale_and_clip(pb_mat)
      
      #make sure the samples are in the matrix
      code_comps_order <- code_comps_order[code_comps_order %in% colnames(pb_mat)]
      
      #also prepare factor for ordering of heatmap
      condition_vector_ordering <- factor(subpmd[match(code_comps_order, subpmd$Code), "Condition"], levels = c(c1,c2))
      
      pb_mat <- pb_mat[,code_comps_order,drop=FALSE]
      
      hm_de_clust_pb <- draw_offscreen(
        build_heatmap(pb_mat, 
                      name = 'Scaled\nPseudobulk\nValues',
                      column_split = condition_vector_ordering,
                      show_column_names = TRUE)
      )
    }
    
    if(Pseudobulk_mode == FALSE){
      
      #make sure sample is even present in cluster
      # NOTE(review): uses the same c1/c2-filtered cells as the single-cell
      # heatmap; this assumes a sample Code belongs to exactly one Condition
      code_comps_order <- code_comps_order[code_comps_order %in% mdclust$Code]
      
      #also prepare factor for ordering of heatmap
      condition_vector_ordering <- factor(subpmd[match(code_comps_order, subpmd$Code), "Condition"], levels = c(c1,c2))
      
      #mean unscaled value per gene within each sample
      avg_l <- lapply(code_comps_order, function(code){
        codecells <- rownames(mdclust[mdclust$Code == code,])
        avgmat <- matrix(Matrix::rowMeans(rawmat[,codecells, drop=FALSE]))
        rownames(avgmat) <- rownames(rawmat); colnames(avgmat) <- code
        avgmat
      })
      avgmat <- as.matrix(dplyr::bind_cols(avg_l))
      rownames(avgmat) <- rownames(rawmat)
      colnames(avgmat) <- code_comps_order
      
      avgmat <- scale_and_clip(avgmat)
      
      hm_de_clust_pb <- draw_offscreen(
        build_heatmap(avgmat, 
                      name = 'Scaled\nMean\nRisc\nValues',
                      column_split = condition_vector_ordering,
                      show_column_names = TRUE)
      )
    }
    
    
    return(list(hm_de_clust = hm_de_clust,
                hm_de_clust_pb = hm_de_clust_pb))
    
  })
  
  
  
  names(cluster_de_heatmap_l) <- names(clustres)
  
  
  deplotlist <- list(#hm_DE = hm_DE,
    #hm_DE_AVG = hm_DE_AVG,
    cluster_de_heatmap_l = cluster_de_heatmap_l)
  
  
  
  
  deplotfile <- paste0(outdir_DE_plots, lab, '.pdf')
  
  pdf(deplotfile, height = 9, width = 9)
  
  # print(deplotlist$hm_DE)
  # print(deplotlist$hm_DE_AVG)
  invisible(print(deplotlist$cluster_de_heatmap_l))
  
  dev.off()
  
  return(deplotlist)
  
  
  
})




# label the per-comparison DE plot lists so they can be looked up by comparison label
names(DEplots_comps) <- comps$labels
### save cluster levels ###
# sobjint is removed below; keep its cluster levels for later use
saved_cluster_levels <- levels(sobjint$seurat_clusters)

pwayoutdir <- paste0(outdir_int, '/pathwayanalysis_crosscondition/')
# only create the folder when missing; dir.create() warns if it already exists
if(!dir.exists(pwayoutdir)){
  dir.create(pwayoutdir, recursive = TRUE)
}

#at this point, we no longer need many of the high memory using objects
# rm() warns for every name that does not exist, so only remove the objects
# that are actually present in this environment
highmem_objects <- c(
  'sobjint', 
  'ap_cond_to_clust', 'ap_int_celltypes', 'ap_samp_to_clust', 'avg', 'avgl', 'ctplots_int',
  'cutoffplots', 'cutoffs', 'd1_int', 'd2_int', 'd3_int', 'dcond', 'dcond_split', 'dend', 'dp_ctscores',
  'gem', 'ha_clust', 'ha_genes', 'hm_ctscores', 'hm_int', 'hm_refmarkers', 'hm_refmarkers_avg',
  'intmd', 'labelsdf', 'm_integrated_clusters', 'm_ref_small', 'm_reference', 'md', 'mdint', 'nicedend',
  'sampsumplots', 'summaryplots_integrated', 'summaryplots_integrated_ct', 'thisct_plots', 'top',
  'd4_int', 'ip'
)
rm(list = intersect(highmem_objects, ls()))
rm(highmem_objects)

invisible(gc(full = TRUE, reset = FALSE, verbose = FALSE))


#prep pathways
pathways <- preppathways_pathwayanalysis_crosscondition_module(species = species,
                                                               outdir_int = outdir_int)

### run main pathway analysis ###
pways_output_list <- pathwayanalysis_crosscondition_module(
  m_bycluster_crosscondition_de_comps = m_bycluster_crosscondition_de_comps,
  pathways = pathways,
  sample_metadata = sample_metadata,
  # deg.weight = "pval", #this is deprecated now
  comps = comps,
  workernum = workernum,
  outdir_int = outdir_int
)


# unpack the two result lists used by the report below (one element per comparison)
pathway_analysis_mainlist_comps <- pways_output_list$pathway_analysis_mainlist_comps
pathwaysummplots_comps <- pways_output_list$pathwaysummplots_comps



### for easily reproducing plots and etc, save them as R objects...

DE_pathways_plot_objects_list <- list(comps = comps,
                                      m_bycluster_crosscondition_de_comps = m_bycluster_crosscondition_de_comps,
                                      pathway_analysis_mainlist_comps = pathway_analysis_mainlist_comps,
                                      pathwaysummplots_comps = pathwaysummplots_comps,
                                      crossconditionDE_padj_thres = crossconditionDE_padj_thres,
                                      crossconditionDE_lfc_thres = crossconditionDE_lfc_thres)




DE_pathways_plot_objects_list_file <- paste0(pwayoutdir, '/DE_pathways_plot_objects_list.rds')

saveRDS(DE_pathways_plot_objects_list, DE_pathways_plot_objects_list_file)
#### print out NUMDEGS, NUMDEGS THRESHOLDED, AND PWAY ANALYSIS TO HTML


#do this for each comparison


compslen <- nrow(comps)

for(compidx in c(1:compslen) ){
  
  
  #get comparison condition levels
  c1 <- comps[compidx,1]
  c2 <- comps[compidx,2]
  
  #get comp lab
  lab <- comps[compidx,3]
  
  #get cross conditions res per cluster list
  m_bycluster_crosscondition_de <- m_bycluster_crosscondition_de_comps[[compidx]]
  
  #get pway results full list
  pathway_analysis_mainlist <- pathway_analysis_mainlist_comps[[compidx]]
  
  #get pway summary plots
  summplots_cats <- pathwaysummplots_comps[[compidx]]
  
  
  
  # get the deplots
  deplotlist <- DEplots_comps[[lab]]
  
  
  complab <- "


## %s


"
  
  cat(sprintf(complab, lab))
  
  
  
  
  
  ### table with the number of significant DEGs per cluster for this comparison
  # The pseudobulk workflow additionally requires the gene to be expressed in
  # more than 10% of cells of the condition it is higher in; the report text
  # only mentions that filter when it is applied.
  # Assumes Pseudobulk_mode is a single TRUE/FALSE value.
  use_pct_filter <- Pseudobulk_mode == TRUE
  
  if(use_pct_filter){
    
    plotlab <- "


### Number of significant DEGs across conditions in each cluster

Here we check number of differentially expressed genes (DEGs) after applying some statistical thresholds:
    
* Adjusted P value < %s

* Log Fold Change > +/- %s

* Proportion of cells expressing gene in condition A > 0.1 if LFC is positive

* Proportion of cells expressing gene in condition B > 0.1 if LFC is negative

Differential expression compares genes in condition A (left) versus condition B (right). The left column indicates the number of genes upregulated in the left condition, while the right column indicates genes upregulated in the right condition.

Finally, the thresholds above do not affect downstream results from pathway analysis, they are just meant to count the number of DEGs. Downstream usage of DEGs can use these thresholds, or you can choose other appropriate cutoffs.

"
    
  } else {
    
    plotlab <- "


### Number of significant DEGs across conditions in each cluster

Here we check number of differentially expressed genes (DEGs) after applying some statistical thresholds:
    
* Adjusted P value < %s

* Log Fold Change > +/- %s

Differential expression compares genes in condition A (left) versus condition B (right). The left column indicates the number of genes upregulated in the left condition, while the right column indicates genes upregulated in the right condition.

Finally, the thresholds above do not affect downstream results from pathway analysis, they are just meant to count the number of DEGs. Downstream usage of DEGs can use these thresholds, or you can choose other appropriate cutoffs.

"
    
  }
  
  cat( sprintf(plotlab, crossconditionDE_padj_thres, crossconditionDE_lfc_thres) )
  
  
  # one column per cluster; row 1 = negative logFC count, row 2 = positive logFC count
  # vapply() guarantees a 2-row integer matrix, also for empty input
  numdegs <- vapply(m_bycluster_crosscondition_de, function(m){
    
    #normal fdr and lfc thresholds
    m <- m[m$FDR < crossconditionDE_padj_thres,, drop=FALSE]
    m <- m[abs(m$logFC) > crossconditionDE_lfc_thres,, drop=FALSE]
    
    upm <- m[m$logFC > 0,,drop=FALSE]
    dnm <- m[m$logFC < 0,,drop=FALSE]
    
    #pct thresholds (pseudobulk only): +FC, pct1 > 0.1; -FC, pct2 > 0.1
    if(use_pct_filter){
      upm <- upm[upm$pct.1 > 0.1,, drop=FALSE]
      dnm <- dnm[dnm$pct.2 > 0.1,, drop=FALSE]
    }
    
    m <- rbind(upm,dnm)
    
    # table() ignores the NA rows produced by NA values in the filters above
    as.vector( table( factor(sign(m$logFC), levels=c(-1,1)) ) )
  }, FUN.VALUE = integer(2))
  
  # negative logFC means higher in c2, positive means higher in c1
  numdegs <- t(numdegs)
  colnames(numdegs) <- c(c2, c1)
  
  #make sure all clusters are shown
  # make a fake df and replace fake with real res
  saved_cluster_levels_withlab <- paste0('cluster_', saved_cluster_levels)
  numdegs_all <- data.frame(Cluster = saved_cluster_levels_withlab,
                            c1 = 0, c2 = 0)
  colnames(numdegs_all) <- c('Cluster', c1, c2)
  rownames(numdegs_all) <- numdegs_all$Cluster
  
  if(nrow(numdegs) > 0){
    numdegs_all[rownames(numdegs), c1] <- numdegs[,c1]
    numdegs_all[rownames(numdegs), c2] <- numdegs[,c2]
  }
  rownames(numdegs_all) <- NULL
  
  #rename colnames to have high
  colnames(numdegs_all) <- c('Cluster', paste0(c1, '_high'), paste0(c2, '_high'))
  
  print(knitr::kable(numdegs_all))
  
  
  
  
  
  #   plotlab <- '
  # ### Heatmap of top cross-condition DEGs for each cluster
  # 
  # Here we plot the top 3 differentially expressed genes across conditions from each cluster.
  # In other words, in cluster 1, we have the top 3 DEGs from %s and the top 3 DEGs from %s, then the same in cluster 2, so on.
  # 
  # 
  # 
  # '
  #   
  #   cat(sprintf(plotlab, c1, c2))
  #   
  #   print(deplotlist$hm_DE)
  
  
  #   ### this is kind of bugged, it prints the plot without making the title a heading...
  #   cat('\n\n\n')
  #   
  #   plotlab <- '
  # ### Heatmap of top cross-condition DEGs for after averaging each cluster
  # 
  # Here we plot the top 3 differentially expressed genes across conditions from each cluster.
  # This heatmap is similar to the one above, but rather than showing all the cells from each cluster, we show the cluster averages.
  # In other words, in cluster 1, we have the top 3 DEGs from %s and the top 3 DEGs from %s, then the same in cluster 2, so on.
  # 
  # 
  # 
  # '
  #   
  #   cat(sprintf(plotlab, c1, c2))
  #   
  #   print(deplotlist$hm_DE_AVG)
  
  
  
  
  
  
  
  ### print the per-cluster DEG heatmaps of this comparison into the report
  cat('\n\n\n')
  
  plotlab <- '
### Per-cluster heatmaps of all DEGs

To ensure the differential expression results are robust, it is helpful to inspect all DEGs by visualizing them in a heatmap. For each cluster, we compare the DEGs at both the single cell level, and either the pseudobulk level (if we used pseudobulk_edgeR) or the average RISC value level (if using wilcox).



'
  
  # static text, so print it directly rather than passing it through sprintf()
  cat(plotlab)
  
  cluster_de_heatmap_l <- deplotlist$cluster_de_heatmap_l
  
  
  # seq_along() so that an empty heatmap list simply prints nothing
  for(i in seq_along(cluster_de_heatmap_l) ){
    clust <- names(cluster_de_heatmap_l)[i]
    clust <- str_to_title(clust)
    
    # element 1 is the single-cell heatmap, element 2 the sample-level heatmap
    clustplots <- cluster_de_heatmap_l[[i]]
    
    
    plotlab <- '
#### %s


'
    
    cat(sprintf(plotlab, clust))
    
    
    plotlab <- '
##### Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.


'
    
    cat(plotlab)
    
    print(clustplots[[1]])
    
    cat('\n\n\n')
    
    
    # the heading of the second heatmap depends on how it was computed
    # assumes Pseudobulk_mode is a single TRUE/FALSE value
    if(Pseudobulk_mode == TRUE){
      plotlab <- '
##### Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.


'
    } else {
      plotlab <- '
##### Differentially Expressed Gene Heatmap plotted after averaging RISC values for samples for this cluster

Here, we plot all significant DEGs after averaging the RISC-transformed values for samples in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.


'
    }
    
    cat(plotlab)
    
    print(clustplots[[2]])
    
    cat('\n\n\n')
    
  }
  
  # clustplots only exists if the loop ran at least once; avoid an rm() warning
  rm(list = intersect(c('cluster_de_heatmap_l', 'clustplots'), ls()))
  
  
  
  
  
  
  
  
  
  ### print the GSEA results of this comparison into the report
  pwayDElab <- "


### Gene set enrichment analysis across conditions

Pathway analysis for the cross-condition analysis is performed for overexpressed and underexpressed genes for each cluster. This is done via Gene Set Enrichment Analysis (GSEA) [(Subramanian et al 2005)](https://www.pnas.org/doi/10.1073/pnas.0506580102).

GSEA is preferred over other pathway analysis such as fisher tests or chi-square tests because it does not require making arbitrary cutoffs to the number of DEGs and takes into account how strongly differentially expressed each gene may be. For the latter, data-driven gene-specific weight is applied. We use a standard weighting method of -log10(P-value) * sign of Log Fold Change.

The pathways we choose in pathway analysis are derived from the Molecular Signatures Database (MSIGDB) where they are sorted by categories, such as Gene Ontology (GO) Biological Process, GO Molecular Function, KEGG, Reactome, etc. These are databases that annotate genes by function or molecular pathway.


"
  
  cat(pwayDElab)
  
  
  # print the pathway analysis
  
  # drop categories without any result
  pathway_analysis_main_nonull <- pathway_analysis_mainlist[lengths(pathway_analysis_mainlist) != 0]
  
  
  #loop through each category:
  # loop thru each cluster
  # keep the table rows shown in the cluster dotplot
  # make some adjustments to the table for printing: shorten gene list
  # print for whole category:
  # 1. summary dotplot for each of the two conditions
  # 2. cluster by cluster, table and dotplot
  
  for(i in seq_along(pathway_analysis_main_nonull) ){
    
    #get category name
    # (not called "cat", which would shadow base::cat used for printing)
    catname <- names(pathway_analysis_main_nonull)[i]
    
    
    #get actual cluster results
    pwaycat <- pathway_analysis_main_nonull[[i]]
    
    # remove empty elements within each cluster --> clusters with no pathways become lists of length 0
    pwaycat <- lapply(pwaycat, function(clust){clust[lengths(clust) != 0]})
    
    # remove the clusters with no pathways by removing the lists of length 0
    pwaycat <- pwaycat[lengths(pwaycat) != 0]
    
    
    ### if the category has NO PATHWAYS SIGNIFICANT in ANY cluster, just skip it
    if(length(pwaycat) == 0){
      next
    }
    
    
    #in each category, prepare the table and dotplot of each cluster
    # lapply over the named list keeps the cluster names on the result
    clustlist <- lapply(pwaycat, function(clustcat){
      
      clustcat <- clustcat[lengths(clustcat) != 0]
      
      if(length(clustcat)==0){
        return(NULL)
      }
      
      #get the plot
      dp <- clustcat$dp
      
      #get the table
      gseares <- clustcat$gseares
      
      
      # use the dotplot data to subset the table to the pathways that are plotted
      gseares_fromplot <- dp$data
      
      #plot labels use spaces / line breaks where the table uses underscores
      gseares_fromplot$Description <- gseares_fromplot$pathway
      gseares_fromplot$Description <- gsub(' ', replacement = '_', gseares_fromplot$Description)
      gseares_fromplot$Description <- gsub('\n', replacement = '_', gseares_fromplot$Description)
      
      gseares <- gseares[match(gseares_fromplot$Description, gseares$pathway),]
      
      
      #keep only important columns...
      cpshow <- gseares
      rownames(cpshow) <- NULL
      
      cpshow <- cpshow[,c('pathway', "NES", "ES", "pval", "padj", "log2err", "size", "leadingEdge")]
      
      #keep only top 5 leading edge genes, collapsed into one string per pathway
      cpshow$leadingEdge <- vapply(cpshow$leadingEdge, function(x){
        x <- x[x!='']
        
        if(length(x) > 5){
          x <- c(head(x,5), '...')
        }
        
        paste(x, collapse = '/')
      }, FUN.VALUE = character(1), USE.NAMES = FALSE)
      
      
      #instead of printing, output a list of them
      list(cpshow=cpshow,
           dp=dp)
      
    }) #close cluster lapply
    
    
    
    #get the summaryplots of this category
    # i indexes the filtered category list, so it can point at the wrong entry
    # of summplots_cats once an empty category was dropped; look the category
    # up by name when summplots_cats carries category names
    # NOTE(review): falls back to the positional index otherwise - confirm how
    # the pathway module orders / names pathwaysummplots_comps
    if( isTRUE(catname %in% names(summplots_cats)) ){
      summplots_conds <- summplots_cats[[catname]]
    } else {
      summplots_conds <- summplots_cats[[i]]
    }
    
    
    
    # print for whole category:
    # 1. summary dotplot of whole category, for each condition
    # 2. cluster by cluster, table and dotplots
    
    #print category label
    catlab <- "


#### %s


"
    
    cat(sprintf(catlab, catname))
    
    
    
    conds <- c(c1,c2)
    
    for(condidx in seq_along(conds) ){
      
      cond <- conds[condidx]
      
      summplot <- summplots_conds[[condidx]]
      
      
      if( is.null(summplot) ){
        
        summlab <- "


##### Summary %s, no pathways enriched
    
This category of pathways had no significantly enriched pathways in %s


"
        
        cat(sprintf(summlab, cond, cond))
        
        next
        
      }
      
      
      
      summlab <- "


##### Summaryplot: %s
    
Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in %s. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.


"
      
      cat(sprintf(summlab, cond, cond))
      
      print(summplot)
      
    }
    
    
    
    clustlab <- "


##### Per-cluster pathway results
      
Here we plot the pathways that are significantly enriched in the differentially expressed genes between %s and %s for each cluster.

For each condition, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. Positive NES indicates the pathway is enriched in %s relative to %s, and vice-versa.

If a cluster is missing, it indicates no pathways were significantly differentially expressed.



"
    
    cat(sprintf(clustlab, c1, c2, c1, c2))
    
    
    #for each cluster, print out the table and the dotplot
    for(clust in names(clustlist) ){
      
      clust_plot_tab <- clustlist[[clust]]
      
      # nothing to show for this cluster
      if( is.null(clust_plot_tab) ){
        next
      }
      
      
      clust_cond_lab <- "





###### %s


"
      
      cat(sprintf(clust_cond_lab, clust))
      
      
      cpshow <- clust_plot_tab$cpshow
      dp <- clust_plot_tab$dp
      
      
      print( knitr::kable(cpshow) )
      print(dp)
      
      
    } #close cluster printing loop
    
    
  } # close category by category for loop
  
  
  
} # close comparisons loop

11.1 Covid_Critical_vs_Healthy

11.1.1 Number of significant DEGs across conditions in each cluster

Here we check number of differentially expressed genes (DEGs) after applying some statistical thresholds:

  • Adjusted P value < 0.1

  • Log Fold Change > +/- 0

  • Proportion of cells expressing gene in condition A > 0.1 if LFC is positive

  • Proportion of cells expressing gene in condition B > 0.1 if LFC is negative

Differential expression compares genes in condition A (left) versus condition B (right). The left column indicates the number of genes upregulated in the left condition, while the right column indicates genes upregulated in the right condition.

Finally, the thresholds above do not affect downstream results from pathway analysis, they are just meant to count the number of DEGs. Downstream usage of DEGs can use these thresholds, or you can choose other appropriate cutoffs.

Cluster Covid_Critical_high Healthy_high
cluster_1 306 133
cluster_2 63 16
cluster_3 69 26
cluster_4 389 192
cluster_5 0 0
cluster_6 36 41
cluster_7 0 0

11.1.2 Per-cluster heatmaps of all DEGs

To ensure the differential expression results are robust, it is helpful to inspect all DEGs by visualizing them in a heatmap. For each cluster, we compare the DEGs at both the single-cell level and either the pseudobulk level (if we used pseudobulk_edgeR) or the average RISC value level (if we used wilcox).

11.1.2.1 Cluster_1

11.1.2.1.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.1.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.2 Cluster_2

11.1.2.2.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.2.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.3 Cluster_3

11.1.2.3.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.3.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.4 Cluster_4

11.1.2.4.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.4.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.5 Cluster_6

11.1.2.5.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.2.5.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.1.3 Gene set enrichment analysis across conditions

Pathway analysis for the cross-condition analysis is performed for overexpressed and underexpressed genes for each cluster. This is done via Gene Set Enrichment Analysis (GSEA) (Subramanian et al., 2005).

GSEA is preferred over other pathway analysis methods, such as Fisher's exact tests or chi-square tests, because it does not require making arbitrary cutoffs to select DEGs and it takes into account how strongly differentially expressed each gene is. To capture the latter, a data-driven, gene-specific weight is applied. We use a standard weighting method: -log10(P-value) * sign of Log Fold Change.

The pathways we choose in pathway analysis are derived from the Molecular Signatures Database (MSIGDB) where they are sorted by categories, such as Gene Ontology (GO) Biological Process, GO Molecular Function, KEGG, Reactome, etc. These are databases that annotate genes by function or molecular pathway.

11.1.3.1 HALLMARK

11.1.3.1.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.1.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.1.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, it indicates no pathways were significantly enriched in that cluster.

11.1.3.1.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_E2F_TARGETS 2.728753 0.8201163 0.0000000 0.0000000 1.5565442 192 TK1/STMN1/MKI67/CDC20/BIRC5/…
HALLMARK_G2M_CHECKPOINT 2.637151 0.7966623 0.0000000 0.0000000 1.3877461 172 STMN1/MKI67/UBE2C/CDC20/BIRC5/…
HALLMARK_MYC_TARGETS_V1 2.090822 0.6294245 0.0000000 0.0000000 0.8140358 190 TYMS/CDC20/MCM4/DUT/PCNA/…
HALLMARK_MITOTIC_SPINDLE 2.013879 0.6113177 0.0000000 0.0000004 0.7195128 165 BIRC5/TOP2A/CENPF/TPX2/PLK1/…
HALLMARK_ESTROGEN_RESPONSE_LATE 1.950394 0.6695374 0.0000248 0.0002477 0.5756103 70 AREG/PRSS23/CISH/ZFP36/FABP5/…
HALLMARK_MTORC1_SIGNALING 1.666299 0.5043537 0.0002052 0.0017100 0.5188481 180 MCM4/RRM2/PLK1/DHFR/MCM2/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.783066 0.6326189 0.0002650 0.0018928 0.4984931 60 AREG/LGALS1/TNFAIP3/JUN/TIMP1/…
HALLMARK_GLYCOLYSIS 1.645698 0.5158091 0.0008169 0.0051056 0.4772708 129 STMN1/CDK1/HMMR/NASP/AURKA/…
HALLMARK_SPERMATOGENESIS 1.611055 0.5784368 0.0039929 0.0174638 0.4070179 54 CDKN3/CCNB2/CDK1/NCAPH/KIF2C/…
HALLMARK_APOPTOSIS 1.589836 0.4989284 0.0041913 0.0174638 0.4070179 127 TOP2A/LGALS3/HMGB2/PMAIP1/BAX/…

11.1.3.1.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_TNFA_SIGNALING_VIA_NFKB 2.322689 0.6313845 0.0000000 0.0000000 0.8986712 140 KLF2/RHOB/CD69/JUN/ZBTB10/…
HALLMARK_INFLAMMATORY_RESPONSE 1.936725 0.5591517 0.0000070 0.0001753 0.6105269 95 CD69/EMP3/KLF6/MYC/NFKBIA/…
HALLMARK_P53_PATHWAY 1.738750 0.4725668 0.0000771 0.0012851 0.5384341 143 JUN/BAX/PPP1R15A/TOB1/FOS/…
HALLMARK_APOPTOSIS 1.742682 0.4860557 0.0002342 0.0029277 0.5188481 123 RHOB/CD69/JUN/PMAIP1/IRF1/…
HALLMARK_IL6_JAK_STAT3_SIGNALING 1.827284 0.5843341 0.0008299 0.0082987 0.4772708 49 JUN/STAT1/IRF1/SOCS1/IL10RB/…
HALLMARK_IL2_STAT5_SIGNALING 1.600515 0.4456460 0.0013672 0.0097654 0.4550599 124 RHOB/KLF6/MYC/PLIN2/IFITM3/…
HALLMARK_ESTROGEN_RESPONSE_EARLY 1.624859 0.4931969 0.0051265 0.0284806 0.4070179 67 MYC/AREG/TOB1/FOS/LRIG1/…
HALLMARK_HYPOXIA 1.464556 0.4187076 0.0141024 0.0705121 0.3807304 100 JUN/KLF6/PLIN2/PPP1R15A/FOS/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.564335 0.4953251 0.0173514 0.0722976 0.3524879 52 RHOB/JUN/EMP3/AREG/TGFBI/…
HALLMARK_ESTROGEN_RESPONSE_LATE 1.471464 0.4399010 0.0168521 0.0722976 0.3524879 73 AREG/FOS/SGK1/NXT1/LARGE1/…
HALLMARK_FATTY_ACID_METABOLISM -1.397312 -0.3773208 0.0240373 0.0924511 0.3524879 102 ACO2/NSDHL/ECI2/RETSAT/HSP90AA1/…
HALLMARK_OXIDATIVE_PHOSPHORYLATION -1.507742 -0.3757902 0.0009974 0.0083119 0.4550599 190 NDUFS3/ACO2/MTX2/CASP7/TOMM70/…

11.1.3.1.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_TNFA_SIGNALING_VIA_NFKB 2.162940 0.6192789 0.0000000 0.0000000 0.8012156 130 CD69/SGK1/SOCS3/NFKBIA/IFNGR2/…
HALLMARK_IL2_STAT5_SIGNALING 2.039593 0.5811499 0.0000000 0.0000007 0.7337620 136 PIM1/PTGER2/TNFRSF4/TNFRSF18/SOCS2/…
HALLMARK_INFLAMMATORY_RESPONSE 2.055382 0.6083283 0.0000002 0.0000032 0.6901325 103 CD69/PTGER2/NFKBIA/IFNGR2/OSM/…
HALLMARK_IL6_JAK_STAT3_SIGNALING 2.085060 0.6931223 0.0000017 0.0000208 0.6435518 50 PIM1/SOCS3/IFNGR2/JUN/TNFRSF1A/…
HALLMARK_HYPOXIA 1.952689 0.5823359 0.0000025 0.0000252 0.6272567 99 PIM1/FOS/JUN/ZFP36/LDHA/…
HALLMARK_UV_RESPONSE_UP 1.814924 0.5412512 0.0000541 0.0004512 0.5573322 99 NFKBIA/FOS/SIGMAR1/POLG2/JUNB/…
HALLMARK_ALLOGRAFT_REJECTION 1.738773 0.4936802 0.0000857 0.0006122 0.5384341 139 EIF5A/IFNGR2/IL16/CD40LG/SOCS1/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.697457 0.4750014 0.0001210 0.0007562 0.5384341 159 PIM1/CD69/ARID5B/SOCS3/NFKBIA/…
HALLMARK_ESTROGEN_RESPONSE_LATE 1.797445 0.5662766 0.0006332 0.0035175 0.4772708 69 SGK1/AREG/FOS/ZFP36/NXT1/…
HALLMARK_TGF_BETA_SIGNALING 1.630331 0.5733072 0.0053376 0.0242618 0.4070179 38 IFNGR2/ID3/JUNB/UBE2D3/SMURF2/…

11.1.3.1.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_COMPLEMENT 2.170216 0.6672156 0.0000000 0.0000006 0.7477397 137 CLU/PIM1/S100A12/S100A9/CTSD/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.993342 0.5952694 0.0000004 0.0000090 0.6749629 173 PIM1/FCGR1A/HLA-DRB1/NFKBIA/CXCL10/…
HALLMARK_APOPTOSIS 1.929190 0.6000240 0.0000246 0.0004097 0.5756103 125 CLU/GNA15/TXNIP/GADD45B/EMP1/…
HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.837970 0.5585377 0.0001026 0.0012823 0.5384341 150 MAP3K8/BCL2A1/NFKBIA/CXCL10/MARCKS/…
HALLMARK_INFLAMMATORY_RESPONSE 1.844172 0.5735815 0.0001305 0.0013048 0.5188481 125 ADM/CD82/NFKBIA/CXCL10/GNA15/…
HALLMARK_CHOLESTEROL_HOMEOSTASIS 1.851895 0.6407820 0.0003801 0.0027152 0.4984931 61 CLU/FABP5/FADS2/FDPS/S100A11/…
HALLMARK_IL2_STAT5_SIGNALING 1.776333 0.5500254 0.0003491 0.0027152 0.4984931 130 PIM1/MAP3K8/CCND3/SLC39A8/CXCL10/…
HALLMARK_HYPOXIA 1.779982 0.5631392 0.0007326 0.0045790 0.4772708 109 PIM1/ADM/LDHA/SLC2A3/TGFBI/…
HALLMARK_COAGULATION 1.785994 0.6332655 0.0010480 0.0055257 0.4550599 53 CLU/GNB2/CTSB/SERPING1/MAFF/…
HALLMARK_MYOGENESIS 1.803834 0.5986199 0.0016842 0.0076553 0.4550599 78 CLU/TNNT1/GADD45B/MEF2C/IGFBP7/…

11.1.3.1.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_ESTROGEN_RESPONSE_LATE 1.798427 0.5196117 0.0047554 0.0792569 0.4070179 48 AREG/S100A9/TSPAN13/ISG20/ATP2B4/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.557464 0.3934974 0.0034506 0.0792569 0.4317077 128 IFI27/IFITM3/HLA-DQA1/FCGR1A/HLA-DRB1/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.798903 0.5420057 0.0074444 0.0930552 0.4070179 36 AREG/VCAN/JUN/WIPF1/BASP1
HALLMARK_OXIDATIVE_PHOSPHORYLATION -1.360880 -0.4361828 0.0041952 0.0792569 0.4070179 180 TCIRG1/ACO2/UQCRFS1/ATP6V1G1/NDUFA4/…

11.1.3.2 GO_BP

11.1.3.2.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.2.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.2.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, it indicates no pathways were significantly enriched in that cluster.

11.1.3.2.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOBP_MITOTIC_CELL_CYCLE_PROCESS 2.189977 0.6119397 0.0000000 0.0000000 1.3188888 560 STMN1/MKI67/UBE2C/CDC20/MCM4/…
GOBP_MITOTIC_NUCLEAR_DIVISION 2.265175 0.6666883 0.0000000 0.0000000 1.0476265 239 MKI67/UBE2C/CDC20/CENPF/TPX2/…
GOBP_ORGANELLE_FISSION 2.164888 0.6197788 0.0000000 0.0000000 1.0476265 346 MKI67/UBE2C/CDC20/TOP2A/CENPF/…
GOBP_SISTER_CHROMATID_SEGREGATION 2.270348 0.6893656 0.0000000 0.0000000 0.9325952 167 UBE2C/CDC20/TOP2A/CENPF/PLK1/…
GOBP_MITOTIC_SISTER_CHROMATID_SEGREGATION 2.229011 0.6878440 0.0000000 0.0000000 0.8634154 147 UBE2C/CDC20/CENPF/PLK1/PTTG1/…
GOBP_PROTEIN_DNA_COMPLEX_ASSEMBLY 2.183707 0.6743030 0.0000000 0.0000000 0.8390889 148 H3C2/CENPF/ASF1B/CENPN/CDC45/…
GOBP_MITOTIC_SPINDLE_ORGANIZATION 2.195368 0.6955760 0.0000000 0.0000005 0.7881868 109 STMN1/CDC20/TPX2/PLK1/KIF23/…
GOBP_REGULATION_OF_CHROMOSOME_SEGREGATION 2.226644 0.7419976 0.0000000 0.0000043 0.7477397 73 MKI67/UBE2C/CENPF/PLK1/ZWINT/…
GOBP_METAPHASE_ANAPHASE_TRANSITION_OF_CELL_CYCLE 2.157218 0.7545796 0.0000002 0.0000364 0.6901325 55 UBE2C/CENPF/PLK1/ZWINT/MAD2L1/…
GOBP_REGULATION_OF_CHROMOSOME_SEPARATION 2.158176 0.7428544 0.0000002 0.0000501 0.6749629 61 UBE2C/CENPF/PLK1/PTTG1/ZWINT/…
GOBP_PROCESS_UTILIZING_AUTOPHAGIC_MECHANISM -1.284567 -0.2997422 0.0020314 0.0825689 0.4317077 408 WDR45B/C9orf72/ERCC4/SIRT1/PSEN1/…
GOBP_POSITIVE_REGULATION_OF_PROTEIN_EXIT_FROM_ENDOPLASMIC_RETICULUM -1.788954 -0.8818964 0.0018853 0.0783333 0.4550599 7 TMEM30A/BCAP31/EDEM1
GOBP_NEGATIVE_REGULATION_OF_ACTIN_NUCLEATION -1.789769 -0.9239111 0.0007261 0.0370781 0.4772708 6 HIP1R/CORO1A/GMFB

11.1.3.2.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 1.941814 0.4773758 0.0000000 0.0000007 0.8390889 375 IGLV7-46/IGLV3-9/IGHV3-72/IGHV3-49/IGKV1D-17/…
GOBP_POSITIVE_REGULATION_OF_B_CELL_ACTIVATION 2.108893 0.5992717 0.0000000 0.0001095 0.7337620 107 IGHV3-72/IGHV3-49/IGHV6-1/PELI1/IGHV7-81/…
GOBP_HUMORAL_IMMUNE_RESPONSE_MEDIATED_BY_CIRCULATING_IMMUNOGLOBULIN 2.155604 0.6492334 0.0000001 0.0002414 0.7049757 75 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOBP_B_CELL_RECEPTOR_SIGNALING_PATHWAY 2.021007 0.5799128 0.0000006 0.0009960 0.6594444 101 IGHV3-72/IGHV3-49/IGHV6-1/BAX/IGHV7-81/…
GOBP_B_CELL_ACTIVATION 1.844998 0.4695816 0.0000007 0.0009960 0.6594444 244 IGHV3-72/IGHV3-49/KLF6/IGHV6-1/PELI1/…
GOBP_COMPLEMENT_ACTIVATION 2.107365 0.6394987 0.0000011 0.0011963 0.6435518 70 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOBP_PHAGOCYTOSIS_RECOGNITION 2.057115 0.6349051 0.0000050 0.0033870 0.6105269 65 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOBP_MEMBRANE_INVAGINATION 1.968101 0.5630140 0.0000050 0.0033870 0.6105269 102 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOBP_CELL_RECOGNITION 1.966266 0.5547062 0.0000041 0.0033870 0.6105269 112 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOBP_GLIAL_CELL_PROLIFERATION 1.963143 0.7576787 0.0002560 0.0796927 0.4984931 21 MYC/AREG/TSPO/PPP1CC/PLAG1/…
GOBP_REGULATION_OF_CARTILAGE_DEVELOPMENT -1.903577 -0.7564689 0.0002825 0.0821368 0.4984931 17 SMPD3/TRPS1/SMAD3/SOX5/GLG1/…
GOBP_CYTOPLASMIC_TRANSLATION -1.796263 -0.4698042 0.0000408 0.0203316 0.5573322 135 RPL41/RPS15A/RPS16/RPL37A/RPL29/…

11.1.3.2.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOBP_RESPONSE_TO_ORGANIC_CYCLIC_COMPOUND 1.892491 0.4842527 0.0000000 0.0000014 0.8140358 367 PIM1/SGK1/PTGER2/AREG/ZFP36L1/…
GOBP_RECEPTOR_SIGNALING_PATHWAY_VIA_STAT 2.033450 0.6192620 0.0000006 0.0003507 0.6594444 84 SOCS3/TNFRSF18/SOCS2/OSM/TNFRSF1A/…
GOBP_RESPONSE_TO_CORTICOSTEROID 2.075931 0.6506483 0.0000015 0.0005411 0.6435518 67 SGK1/AREG/ZFP36L1/FOS/ZFP36/…
GOBP_TYROSINE_PHOSPHORYLATION_OF_STAT_PROTEIN 2.018249 0.7167395 0.0000220 0.0053147 0.5756103 34 SOCS3/TNFRSF18/OSM/TNFRSF1A/SOCS1/…
GOBP_RESPONSE_TO_CAMP 1.988106 0.7060350 0.0000484 0.0097861 0.5573322 34 AREG/ZFP36L1/FDX1/ITPR2/RAP1A/…
GOBP_RESPONSE_TO_PURINE_CONTAINING_COMPOUND 1.952358 0.6489001 0.0000749 0.0137962 0.5384341 48 AREG/ZFP36L1/FOS/TRPM2/FDX1/…
GOBP_STEM_CELL_PROLIFERATION 1.910631 0.7318622 0.0001598 0.0225343 0.5188481 25 PIM1/ZFP36L1/N4BP2L2/LTBP3/EIF2AK2/…
GOBP_KERATINOCYTE_PROLIFERATION 1.910881 0.8309192 0.0002265 0.0268445 0.5188481 13 AREG/ZFP36L1/ZFP36/KLF9/BCL11B/…
GOBP_REGULATION_OF_KERATINOCYTE_PROLIFERATION 1.901463 0.8636862 0.0002377 0.0273346 0.5188481 11 AREG/ZFP36L1/ZFP36/KLF9/BCL11B/…
GOBP_REGULATION_OF_STEM_CELL_PROLIFERATION 1.932395 0.7924327 0.0004497 0.0409968 0.4984931 17 PIM1/ZFP36L1/N4BP2L2/LTBP3/EIF2AK2/…
GOBP_ALCOHOL_CATABOLIC_PROCESS -1.810971 -0.6549556 0.0019646 0.0926203 0.4317077 22 ALDH3B1/PTEN/GK/BPNT2/SYNJ1/…
GOBP_CEREBELLAR_CORTEX_DEVELOPMENT -1.847074 -0.6464517 0.0017256 0.0837711 0.4550599 24 TTC21B/WNT7A/CLP1/HERC1/RORA/…
GOBP_EXCITATORY_SYNAPSE_ASSEMBLY -1.825969 -0.8700692 0.0013714 0.0770854 0.4550599 7 PTEN/NPTN/WNT7A
GOBP_MITOCHONDRION_LOCALIZATION -1.839853 -0.5959025 0.0012277 0.0758517 0.4550599 33 KAT2A/FEZ1/SLC4A5/LRPPRC/MFN1/…
GOBP_HINDBRAIN_MORPHOGENESIS -1.880683 -0.6900828 0.0010563 0.0695336 0.4550599 20 TTC21B/WNT7A/HERC1/RORA/DAB1/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN_VIA_MHC_CLASS_IB -1.725770 -0.9280066 0.0007618 0.0575309 0.4772708 5 TAP2/HLA-G
GOBP_N_TERMINAL_PROTEIN_AMINO_ACID_MODIFICATION -1.925206 -0.6737969 0.0006795 0.0529149 0.4772708 24 HHAT/METAP1D/NAA30/NAA40/NAA15/…
GOBP_AXONEMAL_DYNEIN_COMPLEX_ASSEMBLY -1.854941 -0.8636938 0.0004593 0.0413711 0.4984931 8 DNAAF10/DNAI2/DNAAF5
GOBP_LENS_DEVELOPMENT_IN_CAMERA_TYPE_EYE -1.945022 -0.6627393 0.0002943 0.0305581 0.4984931 27 HIPK1/CRYBG3/SPRY1/WNT7A/TGFBR1/…
GOBP_INOSITOL_PHOSPHATE_METABOLIC_PROCESS -1.988218 -0.6906353 0.0002464 0.0279149 0.4984931 25 PPIP5K1/PTEN/BPNT2/SYNJ1/INPP4A

11.1.3.2.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOBP_HUMORAL_IMMUNE_RESPONSE 2.163674 0.6594244 0.0000001 0.0000942 0.7049757 130 CLU/RNASE2/S100A12/HLA-DRB1/S100A9/…
GOBP_ANTIMICROBIAL_HUMORAL_RESPONSE 2.153263 0.7718184 0.0000114 0.0040128 0.5933255 46 RNASE2/S100A12/S100A9/RNASE3/CXCL10/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN_VIA_MHC_CLASS_II 2.096229 0.8575825 0.0000522 0.0091739 0.5573322 23 HLA-DRB1/HLA-DPB1/CTSD/HLA-DPA1/HLA-DRA/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN 2.114744 0.8018881 0.0001064 0.0153056 0.5384341 32 HLA-DRB1/HLA-DPB1/CTSD/HLA-DPA1/HLA-DRA/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_ANTIGEN 2.097620 0.7686913 0.0001359 0.0183338 0.5188481 39 HLA-DRB1/HLA-DPB1/CTSD/HLA-DPA1/HLA-DRA/…
GOBP_DEFENSE_RESPONSE_TO_FUNGUS 2.079787 0.8436805 0.0001465 0.0183338 0.5188481 24 S100A8/S100A12/S100A9/MPO/CX3CR1/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_OR_POLYSACCHARIDE_ANTIGEN_VIA_MHC_CLASS_II 2.065245 0.8114667 0.0001416 0.0183338 0.5188481 27 HLA-DRB1/HLA-DPB1/CTSD/HLA-DPA1/HLA-DRA/…
GOBP_IMMUNOGLOBULIN_PRODUCTION_INVOLVED_IN_IMMUNOGLOBULIN_MEDIATED_IMMUNE_RESPONSE 2.035406 0.6981273 0.0001513 0.0183338 0.5188481 57 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOBP_DEFENSE_RESPONSE_TO_GRAM_NEGATIVE_BACTERIUM 2.036333 0.7868104 0.0002425 0.0266826 0.5188481 29 RNASE2/RNASE3/H2BC11/CD4/MPEG1/…
GOBP_PEPTIDE_ANTIGEN_ASSEMBLY_WITH_MHC_PROTEIN_COMPLEX 2.066526 0.8862317 0.0004514 0.0438580 0.4984931 17 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOBP_MRNA_PROCESSING -1.390379 -0.3320768 0.0007022 0.0571538 0.4772708 402 TENT4A/SRSF4/RRP1B/CDK11A/DHX15/…
GOBP_RNA_PROCESSING -1.327048 -0.3035917 0.0000869 0.0134643 0.5384341 781 TENT4A/SRSF4/PTCD1/METTL25B/RRP1B/…

11.1.3.2.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOBP_PEPTIDE_ANTIGEN_ASSEMBLY_WITH_MHC_CLASS_II_PROTEIN_COMPLEX 2.327248 0.8866218 0.0000186 0.0229055 0.5756103 14 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOBP_PEPTIDE_ANTIGEN_ASSEMBLY_WITH_MHC_PROTEIN_COMPLEX 2.307622 0.8416140 0.0000307 0.0289097 0.5573322 18 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOBP_DEFENSE_RESPONSE_TO_FUNGUS 2.280060 0.8686447 0.0000660 0.0430338 0.5384341 14 S100A8/S100A9/S100A12/CX3CR1
GOBP_NEUTROPHIL_CHEMOTAXIS 2.181337 0.7034697 0.0000832 0.0458773 0.5384341 33 S100A8/S100A9/S100A12/CD74
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN_VIA_MHC_CLASS_II 2.221043 0.7725118 0.0001179 0.0497257 0.5384341 22 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_ANTIGEN 2.096504 0.6612864 0.0001518 0.0604769 0.5188481 37 HLA-DPB1/HLA-DQA1/CD1C/HLA-DRB1/HLA-DPA1/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN 2.129781 0.6980586 0.0002221 0.0725941 0.5188481 30 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_OR_POLYSACCHARIDE_ANTIGEN_VIA_MHC_CLASS_II 2.147929 0.7386930 0.0002888 0.0767070 0.4984931 24 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOBP_RESPONSE_TO_FUNGUS 2.197113 0.7588724 0.0003386 0.0793038 0.4984931 23 S100A8/S100A9/S100A12/CX3CR1
GOBP_REGULATION_OF_VIRAL_ENTRY_INTO_HOST_CELL 2.112268 0.7346781 0.0005561 0.0972755 0.4772708 22 IFITM3/HLA-DRB1/IFITM1/CD74/LY6E/…
GOBP_RESPONSE_TO_NERVE_GROWTH_FACTOR -1.742844 -0.7272221 0.0004195 0.0869578 0.4984931 20 ACAP2/EIF4A3/ARF6/KIDINS220/APP/…
GOBP_TOOTH_MINERALIZATION -1.560006 -0.9840261 0.0003538 0.0793038 0.4984931 3 TCIRG1
GOBP_VITAMIN_D_RECEPTOR_SIGNALING_PATHWAY -1.626882 -0.9634434 0.0002885 0.0767070 0.4984931 4 RXRA/SNW1
GOBP_REGULATION_OF_RESPONSE_TO_EXTRACELLULAR_STIMULUS -1.686575 -0.9469819 0.0002329 0.0725941 0.5188481 5 RXRA/SNW1

11.1.3.3 GO_MF

11.1.3.3.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.3.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.3.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, it indicates no pathways were significantly enriched in that cluster.

11.1.3.3.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOMF_CYTOSKELETAL_MOTOR_ACTIVITY 1.986712 0.7037084 0.0000251 0.0167721 0.5756103 51 MYL6B/KIF23/KIFC1/CENPE/KIF2C/…
GOMF_MICROTUBULE_MOTOR_ACTIVITY 1.974092 0.7303336 0.0000292 0.0167721 0.5756103 38 KIF23/KIFC1/CENPE/KIF2C/KIF15/…
GOMF_MODIFIED_AMINO_ACID_BINDING 1.951044 0.7127251 0.0000401 0.0167721 0.5573322 42 TYMS/DHFR/UROS/SYTL2/PTGES2/…
GOMF_SINGLE_STRANDED_DNA_BINDING 1.832294 0.5844342 0.0000303 0.0167721 0.5756103 110 MCM4/CDC45/MCM7/MCM2/RAD51/…
GOMF_CYCLIN_DEPENDENT_PROTEIN_SERINE_THREONINE_KINASE_REGULATOR_ACTIVITY 1.944380 0.7381697 0.0002052 0.0687032 0.5188481 33 CKS2/CCNB2/CCNA2/CCNB1/CDKN1B/…
GOMF_CYSTEINE_TYPE_ENDOPEPTIDASE_INHIBITOR_ACTIVITY 1.900817 0.7730034 0.0005785 0.0811981 0.4772708 24 BIRC5/PTTG1/TNFSF14/PRDX3/XIAP/…
GOMF_TRANSLATION_REPRESSOR_ACTIVITY_MRNA_REGULATORY_ELEMENT_BINDING 1.865708 0.8725790 0.0004862 0.0811981 0.4984931 12 TYMS/DHFR/SHMT2
GOMF_DNA_REPLICATION_ORIGIN_BINDING 1.826807 0.8264052 0.0005821 0.0811981 0.4772708 15 CDC45/MCM2/MCM10/MCM5/HSPD1/…
GOMF_SINGLE_STRANDED_DNA_HELICASE_ACTIVITY 1.854026 0.7590126 0.0011215 0.0938730 0.4550599 23 MCM4/MCM7/MCM2/RAD51/MCM5/…
GOMF_TRANSLATION_REPRESSOR_ACTIVITY 1.845158 0.7867681 0.0012523 0.0998226 0.4550599 19 TYMS/DHFR/SHMT2
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -1.498186 -0.3895763 0.0009113 0.0897317 0.4772708 149 RPS4X/MRPL2/RPL11/RPL10A/RPS23/…
GOMF_CATALYTIC_ACTIVITY_ACTING_ON_RNA -1.382495 -0.3248788 0.0006476 0.0833946 0.4772708 330 EXOG/CNOT8/DDX46/FARSB/APEX1/…
GOMF_TELOMERASE_INHIBITOR_ACTIVITY -1.591999 -0.9807853 0.0004116 0.0811981 0.4984931 3 ERCC4/PIF1

11.1.3.3.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOMF_IMMUNOGLOBULIN_RECEPTOR_BINDING 2.223378 0.6808708 3.00e-07 0.0004202 0.6749629 61 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOMF_ANTIGEN_BINDING 1.865352 0.5144477 1.83e-05 0.0101870 0.5756103 128 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -1.979670 -0.5056111 1.00e-06 0.0008148 0.6435518 150 RPL41/RPS15A/RPS16/RPL37A/MRPS23/…

11.1.3.3.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOMF_SIGNALING_RECEPTOR_REGULATOR_ACTIVITY 2.032457 0.5963079 0.0000007 0.0005489 0.6594444 100 AREG/IL16/OSM/IL32/CD40LG/…
GOMF_TUMOR_NECROSIS_FACTOR_ACTIVATED_RECEPTOR_ACTIVITY 1.718074 0.9968417 0.0000004 0.0005489 0.6749629 4 TNFRSF4/TNFRSF18
GOMF_CYTOKINE_ACTIVITY 2.161221 0.7274334 0.0000022 0.0012280 0.6272567 42 IL16/OSM/IL32/CD40LG/GREM2/…
GOMF_CATALYTIC_ACTIVITY_ACTING_ON_A_NUCLEIC_ACID -1.364325 -0.2998334 0.0004322 0.0902781 0.4984931 493 DHX8/ALKBH1/POLR1E/AGO1/FTO/…
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -1.587901 -0.4017302 0.0002475 0.0590874 0.4984931 149 RPLP2/RPS7/RPL35/MRPS33/MRPS14/…
GOMF_INOSITOL_HEXAKISPHOSPHATE_KINASE_ACTIVITY -1.769559 -0.9491201 0.0001786 0.0497426 0.5188481 5 PPIP5K1
GOMF_1_PHOSPHATIDYLINOSITOL_4_KINASE_ACTIVITY -1.607687 -0.9900341 0.0001210 0.0404335 0.5384341 3 PI4KA/PI4KB

11.1.3.3.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOMF_MHC_PROTEIN_COMPLEX_BINDING 2.048490 0.7798483 0.0001983 0.0349329 0.5188481 30 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOMF_MHC_CLASS_II_PROTEIN_COMPLEX_BINDING 2.041239 0.8134498 0.0001137 0.0349329 0.5384341 24 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOMF_ENDONUCLEASE_ACTIVITY_ACTIVE_WITH_EITHER_RIBO_OR_DEOXYRIBONUCLEIC_ACIDS_AND_PRODUCING_3_PHOSPHOMONOESTERS 1.989349 0.9231102 0.0000922 0.0349329 0.5384341 12 RNASE2/RNASE1/RNASE4
GOMF_CALCIUM_ION_BINDING 1.763735 0.5082036 0.0000783 0.0349329 0.5384341 219 S100A8/S100A9/DYSF/ASPH/NOTCH2/…
GOMF_PEPTIDE_BINDING 1.742342 0.5194872 0.0001907 0.0349329 0.5188481 164 CLU/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
GOMF_MISFOLDED_PROTEIN_BINDING 1.988678 0.7831213 0.0002979 0.0453585 0.4984931 25 CLU/SDF2L1/HSPA5/DNAJB11/DNAJC10
GOMF_SIGNALING_RECEPTOR_REGULATOR_ACTIVITY 1.831525 0.5684759 0.0006799 0.0875983 0.4772708 111 RETN/ADM/CXCL10/IL1B/CXCL8/…
GOMF_IMMUNE_RECEPTOR_ACTIVITY 1.804870 0.6002018 0.0008141 0.0880927 0.4772708 70 FCGR1A/HLA-DRB1/HLA-DPA1/HLA-DRA/CX3CR1/…
GOMF_PEPTIDE_ANTIGEN_BINDING 1.943032 0.7944813 0.0009100 0.0896624 0.4772708 21 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DQB1/…
GOMF_ELECTRON_TRANSFER_ACTIVITY 1.791593 0.5657850 0.0010685 0.0994256 0.4550599 100 ASPH/MT-ND6/CYC1/NDUFB9/ETFB/…

11.1.3.3.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOMF_RAGE_RECEPTOR_BINDING 2.112261 0.9811787 0.0000035 0.0018163 0.6272567 6 S100A8/S100A12/HMGB2
GOMF_MHC_CLASS_II_RECEPTOR_ACTIVITY 2.263292 0.9449148 0.0000056 0.0021814 0.6105269 9 HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/HLA-DQB1/…
GOMF_MHC_CLASS_II_PROTEIN_COMPLEX_BINDING 2.387379 0.8256735 0.0000119 0.0037266 0.5933255 23 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOMF_MHC_PROTEIN_COMPLEX_BINDING 2.317274 0.7720742 0.0000212 0.0055387 0.5756103 27 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOMF_IMMUNE_RECEPTOR_ACTIVITY 2.172245 0.6511305 0.0000290 0.0064749 0.5756103 48 HLA-DQA1/FCGR1A/IFNGR2/HLA-DRB1/HLA-DPA1/…
GOMF_FATTY_ACID_BINDING 2.353652 0.8792839 0.0000439 0.0085849 0.5573322 15 S100A8/S100A9/PTGDS
GOMF_TOLL_LIKE_RECEPTOR_BINDING 2.211241 0.8792482 0.0000511 0.0088869 0.5573322 11 S100A8/S100A9/TLR1
GOMF_ANTIGEN_BINDING 2.131335 0.6375734 0.0001354 0.0201226 0.5188481 47 HLA-DPB1/HLA-DQA1/CD1C/HLA-DRB1/HLA-DPA1/…
GOMF_LIPOPEPTIDE_BINDING 2.093977 0.9369010 0.0001414 0.0201226 0.5188481 7 CD1C/CD14/CD1D/CD1E
GOMF_PEPTIDE_ANTIGEN_BINDING 2.084922 0.7721010 0.0004730 0.0569361 0.4984931 16 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOMF_PROTON_TRANSMEMBRANE_TRANSPORTER_ACTIVITY -1.589458 -0.5593554 0.0006385 0.0713799 0.4772708 65 TCIRG1/UQCRFS1/ATP6V1G1/UQCR10/DMAC2L/…

11.1.3.4 GO_CC

11.1.3.4.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.4.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.4.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, it indicates no pathways were significantly enriched in that cluster.

11.1.3.4.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOCC_CHROMOSOMAL_REGION 2.123823 0.6151169 0.0000000 0.0000000 0.9653278 295 BIRC5/CENPM/MCM4/TOP2A/CENPF/…
GOCC_CONDENSED_CHROMOSOME 2.234498 0.6674972 0.0000000 0.0000000 0.9545416 199 MKI67/BIRC5/CENPM/TOP2A/CENPF/…
GOCC_NUCLEAR_CHROMOSOME 2.139723 0.6447094 0.0000000 0.0000000 0.8390889 180 BIRC5/MCM4/H3C2/TOP2A/PLK1/…
GOCC_CONDENSED_CHROMOSOME_CENTROMERIC_REGION 2.147046 0.6638263 0.0000000 0.0000001 0.8140358 143 BIRC5/CENPM/CENPF/CENPU/PLK1/…
GOCC_CHROMOSOME_CENTROMERIC_REGION 2.036108 0.6205074 0.0000000 0.0000036 0.7337620 161 BIRC5/CENPM/CENPF/CENPU/PLK1/…
GOCC_MITOTIC_SPINDLE 1.954834 0.6148481 0.0000008 0.0000761 0.6594444 124 TPX2/PLK1/CKAP2L/CDK1/MAD2L1/…
GOCC_DNA_PACKAGING_COMPLEX 2.007591 0.6538712 0.0000015 0.0001256 0.6435518 92 H3C2/H2AZ1/H4C3/H1-2/NCAPG/…
GOCC_SPINDLE_POLE 1.933980 0.6001730 0.0000021 0.0001566 0.6272567 137 CDC20/CENPF/TPX2/PLK1/CKAP2L/…
GOCC_CYCLIN_DEPENDENT_PROTEIN_KINASE_HOLOENZYME_COMPLEX 2.005038 0.7465879 0.0000299 0.0013370 0.5756103 39 PCNA/CKS2/CCNB2/CDK1/CCNA2/…
GOCC_T_CELL_RECEPTOR_COMPLEX 1.966689 0.6708984 0.0000287 0.0013370 0.5756103 67 TRAV38-2DV8/TRGV5/TRAV1-1/TRBV20-1/TRBV7-9/…
GOCC_RIBOSOMAL_SUBUNIT -1.559063 -0.3980319 0.0006768 0.0190078 0.4772708 161 HBA1/RPS4X/MRPL2/RPL11/RPL10A/…
GOCC_AIM2_INFLAMMASOME_COMPLEX -1.657264 -0.9593314 0.0005532 0.0163421 0.4772708 4 CASP4
GOCC_CYTOSOLIC_RIBOSOME -1.715178 -0.4735702 0.0004986 0.0158102 0.4772708 90 HBA1/RPS4X/RPL11/RPL10A/RPS23/…
GOCC_RIBOSOME -1.545687 -0.3855265 0.0002511 0.0094941 0.4984931 192 HBA1/RPS4X/MRPL2/RPL11/APEX1/…
GOCC_POLYSOME -1.841048 -0.5500895 0.0002322 0.0091308 0.5188481 58 RPS4X/RPL11/MCRS1/RPL10A/RPS23/…

11.1.3.4.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOCC_IMMUNOGLOBULIN_COMPLEX 2.488389 0.6890838 0.0000000 0.0000000 0.9865463 124 IGLV7-46/IGLV3-9/IGHV3-72/IGHV3-49/IGLV2-18/…
GOCC_IMMUNOGLOBULIN_COMPLEX_CIRCULATING 2.234906 0.6869169 0.0000003 0.0001321 0.6749629 61 IGHV3-72/IGHV3-49/IGHV6-1/IGHV7-81/IGHV2-70/…
GOCC_TRANSCRIPTION_FACTOR_AP_1_COMPLEX 1.788564 0.9770407 0.0000151 0.0024897 0.5933255 5 JUN/JUND/FOS/JUNB
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 1.636280 0.4250024 0.0001710 0.0188108 0.5188481 211 CD69/IGHV3-72/IGHV3-49/IGHV6-1/S1PR1/…
GOCC_CELL_SURFACE 1.425334 0.3504498 0.0007571 0.0681356 0.4772708 362 CD69/IGHV3-72/IGHV3-49/IGHV6-1/S1PR1/…
GOCC_MEMBRANE_PROTEIN_COMPLEX -1.349865 -0.2978227 0.0004103 0.0406191 0.4984931 662 NDUFS3/LIN7C/DNAJC11/HLA-DQA1/NCF4/…
GOCC_RIBOSOME -1.708962 -0.4277936 0.0000306 0.0037919 0.5573322 192 RPL41/MRPL45/MRPL38/RPS15A/RPS16/…
GOCC_CYTOSOLIC_RIBOSOME -1.910424 -0.5256877 0.0000258 0.0036451 0.5756103 91 RPL41/RPS15A/RPS16/RPL37A/RPL29/…
GOCC_LARGE_RIBOSOMAL_SUBUNIT -1.948565 -0.5290340 0.0000053 0.0011309 0.6105269 104 RPL41/MRPL45/MRPL38/RPL29/UBA52/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -2.069381 -0.6357955 0.0000057 0.0011309 0.6105269 50 RPL41/RPL37A/RPL29/UBA52/RPL34/…
GOCC_RIBOSOMAL_SUBUNIT -1.916937 -0.4893259 0.0000009 0.0002953 0.6594444 161 RPL41/MRPL45/MRPL38/RPS15A/RPS16/…

11.1.3.4.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOCC_CELL_SURFACE 1.735673 0.4506186 0.0000002 0.0002110 0.6901325 357 CD69/TNFRSF4/AREG/TNFRSF18/FLOT1/…
GOCC_MEMBRANE_MICRODOMAIN 1.805244 0.5081181 0.0000083 0.0040672 0.5933255 154 FLOT1/TNFRSF1A/HMOX1/SLC25A5/ADTRP/…
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 1.711477 0.4659494 0.0000205 0.0067246 0.5756103 212 CD69/TNFRSF4/TNFRSF18/FLOT1/LAG3/…
GOCC_T_CELL_RECEPTOR_COMPLEX 1.753329 0.5217590 0.0001415 0.0278002 0.5188481 98 TRBV6-6/TRBV7-7/TRGV2/TRAV13-1/TRBV11-1/…
GOCC_TRANSCRIPTION_FACTOR_AP_1_COMPLEX 1.686755 0.9375073 0.0004197 0.0515227 0.4984931 5 FOS/JUN/JUNB
GOCC_CORTICAL_CYTOSKELETON 1.763539 0.5885794 0.0006032 0.0658159 0.4772708 49 CLDN5/FLOT1/GYPC/MYADM/COTL1/…
GOCC_TERTIARY_GRANULE_MEMBRANE 1.791363 0.6048151 0.0009715 0.0954010 0.4772708 46 CEACAM1/CYBA/TRPM2/SLC2A3/FCER1G/…
GOCC_AMPA_GLUTAMATE_RECEPTOR_COMPLEX 1.651277 0.9177881 0.0013888 0.0956082 0.4550599 5 ABHD12/OLFM2/DLG3
GOCC_ANNULATE_LAMELLAE 1.648001 0.9159676 0.0015081 0.0956082 0.4550599 5 EIF5A/XPO1/TNPO3
GOCC_CORTICAL_ACTIN_CYTOSKELETON 1.723687 0.6035615 0.0016765 0.0968414 0.4550599 37 CLDN5/FLOT1/MYADM/COTL1/CAPN2/…
GOCC_ENDOPLASMIC_RETICULUM_TUBULAR_NETWORK_MEMBRANE -1.633336 -0.9499817 0.0015578 0.0956082 0.4550599 4 ATL3/LNPK
GOCC_ALPHA_DNA_POLYMERASE_PRIMASE_COMPLEX -1.640033 -0.9538771 0.0013196 0.0956082 0.4550599 4 POLA1/PRIM2
GOCC_CYTOSOLIC_RIBOSOME -1.671907 -0.4522215 0.0012793 0.0956082 0.4550599 88 RPLP2/RPS7/RPL35/RPL6/RPL23A/…

11.1.3.4.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOCC_VESICLE_LUMEN 2.078219 0.5956180 0.0000000 0.0000010 0.7614608 235 CLU/RETN/HP/RNASE2/S100A8/…
GOCC_EXTERNAL_ENCAPSULATING_STRUCTURE 2.173879 0.6683608 0.0000003 0.0000567 0.6749629 117 CLU/S100A8/S100A9/CTSD/LGALS1/…
GOCC_VACUOLAR_LUMEN 2.116920 0.6518459 0.0000013 0.0002170 0.6435518 115 RETN/RNASE2/PLAC8/CTSD/MPO/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE_MEMBRANE 2.075834 0.7476886 0.0000235 0.0023517 0.5756103 41 FCGR1A/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
GOCC_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX 2.036634 0.6476584 0.0000216 0.0023517 0.5756103 95 S100A8/S100A9/CTSD/LGALS1/FGL2/…
GOCC_MHC_CLASS_II_PROTEIN_COMPLEX 1.984915 0.8989870 0.0000564 0.0046474 0.5573322 14 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOCC_BLOOD_MICROPARTICLE 1.983022 0.6843357 0.0001224 0.0093110 0.5188481 54 CLU/HP/PFN1/IGLV3-21/ACTB/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE 1.977163 0.6870066 0.0002900 0.0179254 0.4984931 52 FCGR1A/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
GOCC_LUMENAL_SIDE_OF_MEMBRANE 1.978420 0.7665233 0.0003744 0.0217799 0.4984931 28 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DQB1/…
GOCC_MHC_PROTEIN_COMPLEX 1.985914 0.8095036 0.0005036 0.0262136 0.4772708 22 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
GOCC_STEREOCILIUM_BUNDLE -2.048812 -0.7930946 0.0000304 0.0027319 0.5756103 20 MYO1C/FCHSD2/TRIOBP/ELMOD3/PAFAH1B1

11.1.3.4.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOCC_CYTOSOLIC_RIBOSOME 2.552043 0.6868500 0.0000000 0.0000000 0.8634154 88 RPS3A/RPS15A/EIF2A/RPL4/RPL8/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT 2.435980 0.7259745 0.0000002 0.0000822 0.6901325 48 RPL4/RPL10/RPL8/RPL34/RPL26/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE_MEMBRANE 2.491602 0.8109419 0.0000003 0.0000862 0.6749629 30 AREG/HLA-DPB1/HLA-DQA1/FCGR1A/HLA-DRB1/…
GOCC_CLATHRIN_COATED_VESICLE_MEMBRANE 2.414883 0.7280867 0.0000015 0.0003474 0.6435518 45 AREG/HLA-DPB1/HLA-DQA1/FCGR1A/HLA-DRB1/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE 2.414769 0.7491360 0.0000026 0.0004912 0.6272567 37 AREG/HLA-DPB1/HLA-DQA1/FCGR1A/HLA-DRB1/…
GOCC_MHC_CLASS_II_PROTEIN_COMPLEX 2.389169 0.8919006 0.0000128 0.0015205 0.5933255 15 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOCC_MHC_PROTEIN_COMPLEX 2.181840 0.7628724 0.0002100 0.0132288 0.5188481 21 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOCC_CYTOSOLIC_SMALL_RIBOSOMAL_SUBUNIT 2.169121 0.6713940 0.0001849 0.0132288 0.5188481 38 RPS3A/RPS15A/EIF2A/RPS27A/ISG15/…
GOCC_ER_TO_GOLGI_TRANSPORT_VESICLE_MEMBRANE 2.160740 0.6572765 0.0001811 0.0132288 0.5188481 42 AREG/HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/…
GOCC_LUMENAL_SIDE_OF_MEMBRANE 2.144702 0.7323909 0.0009136 0.0435769 0.4772708 24 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
GOCC_INTRINSIC_COMPONENT_OF_POSTSYNAPTIC_MEMBRANE -1.677693 -0.8459252 0.0016450 0.0634290 0.4550599 8 NPTN/CANX/CHRNB1/GABBR1
GOCC_U2_TYPE_SPLICEOSOMAL_COMPLEX -1.522253 -0.5225778 0.0013099 0.0566509 0.4550599 82 CASC3/SNW1/CWC15/EIF4A3/TXNL4A/…
GOCC_PROTEASOME_COMPLEX -1.587913 -0.5778117 0.0012897 0.0566509 0.4550599 50 PSMB5/PSMC3/PSMB8/PSMA5/PAAF1/…
GOCC_PEPTIDASE_COMPLEX -1.592547 -0.5487597 0.0002357 0.0132288 0.5188481 79 SUPT20H/PSMB5/PSMC3/ENY2/PSMB8/…
GOCC_ENDOPEPTIDASE_COMPLEX -1.642985 -0.5814710 0.0001667 0.0132288 0.5188481 62 PSMB5/PSMC3/PSMB8/PSMA5/CAPN2/…
GOCC_CATALYTIC_COMPLEX -1.337776 -0.4097882 0.0000087 0.0013807 0.5933255 788 WWP2/CCNT1/CASC3/SNW1/FBXL15/…

11.1.3.5 CP_REACTOME

11.1.3.5.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.5.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.5.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, no pathways were significantly enriched in that cluster.

11.1.3.5.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
REACTOME_CELL_CYCLE 2.289754 0.6357001 0.0000000 0.0000000 1.5092628 604 TYMS/TK1/UBE2C/CDC20/BIRC5/…
REACTOME_CELL_CYCLE_MITOTIC 2.380680 0.6684330 0.0000000 0.0000000 1.4954793 474 TYMS/TK1/UBE2C/CDC20/BIRC5/…
REACTOME_CELL_CYCLE_CHECKPOINTS 2.288546 0.6742240 0.0000000 0.0000000 1.0476265 238 CDC20/CENPM/MCM4/CENPF/CLSPN/…
REACTOME_MITOTIC_G1_PHASE_AND_G1_S_TRANSITION 2.409149 0.7570970 0.0000000 0.0000000 1.0175448 122 TYMS/TK1/MCM4/TOP2A/RRM2/…
REACTOME_MITOTIC_METAPHASE_AND_ANAPHASE 2.229852 0.6634384 0.0000000 0.0000000 0.9759947 215 UBE2C/CDC20/BIRC5/CENPM/CENPF/…
REACTOME_RESOLUTION_OF_SISTER_CHROMATID_COHESION 2.353102 0.7536475 0.0000000 0.0000000 0.9436322 109 CDC20/BIRC5/CENPM/CENPF/CENPU/…
REACTOME_SEPARATION_OF_SISTER_CHROMATIDS 2.242889 0.6828669 0.0000000 0.0000000 0.9325952 171 UBE2C/CDC20/BIRC5/CENPM/CENPF/…
REACTOME_MITOTIC_PROMETAPHASE 2.229588 0.6735329 0.0000000 0.0000000 0.9214260 184 CDC20/BIRC5/CENPM/CENPF/CENPU/…
REACTOME_MITOTIC_SPINDLE_CHECKPOINT 2.311261 0.7428671 0.0000000 0.0000000 0.8986712 104 UBE2C/CDC20/BIRC5/CENPM/CENPF/…
REACTOME_RHO_GTPASES_ACTIVATE_FORMINS 2.230452 0.7034458 0.0000000 0.0000000 0.8266573 119 CDC20/BIRC5/CENPM/CENPF/CENPU/…
REACTOME_SYNTHESIS_OF_BILE_ACIDS_AND_BILE_SALTS -1.734441 -0.6925158 0.0064504 0.0685872 0.4070179 14 AKR1C3/OSBPL9/NCOA1/SCP2
REACTOME_ACYL_CHAIN_REMODELLING_OF_PE -1.757069 -0.7851604 0.0056879 0.0639073 0.4070179 10 PLBD1/ABHD4/PLA2G12A/PLA2G6/LPCAT3
REACTOME_ACTIVATION_OF_THE_MRNA_UPON_BINDING_OF_THE_CAP_BINDING_COMPLEX_AND_EIFS_AND_SUBSEQUENT_BINDING_TO_43S -1.710116 -0.5106442 0.0055110 0.0623656 0.4070179 52 RPS4X/EIF3G/RPS23/EIF3D/RPS5/…
REACTOME_SLC_TRANSPORTER_DISORDERS -1.714183 -0.5072328 0.0047103 0.0544806 0.4070179 56 SLC29A3/TPR/NUP85/POM121/SLC22A5/…
REACTOME_REGULATION_OF_GLUCOKINASE_BY_GLUCOKINASE_REGULATORY_PROTEIN -1.834728 -0.6171543 0.0030878 0.0416451 0.4317077 29 TPR/NUP85/POM121/NUP50/NDC1/…
REACTOME_SUMOYLATION_OF_SUMOYLATION_PROTEINS -1.875133 -0.6100965 0.0016279 0.0256061 0.4550599 35 TPR/NUP85/POM121/PIAS4/NUP50/…
REACTOME_TRNA_PROCESSING_IN_THE_NUCLEUS -1.776648 -0.5269802 0.0012447 0.0208296 0.4550599 58 TPR/NUP85/POM121/TSEN54/ELAC2/…
REACTOME_SUMOYLATION_OF_UBIQUITINYLATION_PROTEINS -1.844694 -0.5917798 0.0008424 0.0155356 0.4772708 39 TPR/NUP85/POM121/PIAS4/NUP50/…
REACTOME_VIRAL_MESSENGER_RNA_SYNTHESIS -1.921465 -0.5992917 0.0006187 0.0128047 0.4772708 42 TPR/NUP85/POM121/POLR2C/POLR2D/…
REACTOME_HEME_SIGNALING -1.974960 -0.6382712 0.0001865 0.0048907 0.5188481 37 HBA1/SIRT1/NCOA1/CHD9/MEF2D/…

11.1.3.5.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 2.065086 0.6687077 0.0000078 0.0015564 0.5933255 54 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_FCGR_ACTIVATION 2.052936 0.6418801 0.0000079 0.0015564 0.5933255 61 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_FCGR3A_MEDIATED_IL10_SYNTHESIS 1.992534 0.5979952 0.0000068 0.0015564 0.6105269 77 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 2.057428 0.6571009 0.0000097 0.0016881 0.5933255 57 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_COMPLEMENT_CASCADE 2.001713 0.6180322 0.0000158 0.0024712 0.5756103 66 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_ROLE_OF_LAT2_NTAL_LAB_ON_CALCIUM_MOBILIZATION 2.002822 0.6258283 0.0000207 0.0025407 0.5756103 63 IGHV1-46/PDPK1/IGHV2-70/IGLV2-23/IGKV1D-16/…
REACTOME_FCERI_MEDIATED_MAPK_ACTIVATION 1.979456 0.5991667 0.0000180 0.0025407 0.5756103 72 JUN/IGHV1-46/FOS/IGHV2-70/IGLV2-23/…
REACTOME_FCERI_MEDIATED_CA_2_MOBILIZATION 1.878472 0.5645345 0.0000844 0.0066104 0.5384341 75 IGHV1-46/IGHV2-70/IGLV2-23/IGKV1D-16/IGKV1D-12/…
REACTOME_ROLE_OF_PHOSPHOLIPIDS_IN_PHAGOCYTOSIS 1.856106 0.5610546 0.0001777 0.0132240 0.5188481 73 IGHV1-46/IGHV2-70/IGLV2-23/IGHG1/IGKV1D-16/…
REACTOME_SCAVENGING_OF_HEME_FROM_PLASMA 1.884746 0.6155340 0.0003831 0.0240119 0.4984931 50 IGHV1-46/IGHV2-70/IGKV1D-16/IGKV1D-12/IGHV4-34/…
REACTOME_MICRORNA_MIRNA_BIOGENESIS -1.875600 -0.6782889 0.0017554 0.0859580 0.4550599 23 AGO2/POLR2K/POLR2D/POLR2G/XPO5/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -1.889937 -0.5088080 0.0000572 0.0049789 0.5573322 100 RPL41/RPS15A/RPS16/TRIB3/RPL37A/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -1.924067 -0.5220484 0.0000507 0.0047665 0.5573322 95 RPL41/RPS15A/EIF3K/RPS16/RPL37A/…
REACTOME_PIWI_INTERACTING_RNA_PIRNA_BIOGENESIS -2.039703 -0.8056334 0.0000517 0.0047665 0.5573322 16 POLR2K/HSP90AA1/PLD6/POLR2D/HENMT1/…
REACTOME_SELENOAMINO_ACID_METABOLISM -1.930393 -0.5204926 0.0000211 0.0025407 0.5756103 99 RPL41/RPS15A/RPS16/RPL37A/RPL29/…
REACTOME_NONSENSE_MEDIATED_DECAY_NMD -1.954446 -0.5186864 0.0000046 0.0014550 0.6105269 111 RPL41/RPS15A/RPS16/RPL37A/RPL29/…
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -2.064545 -0.5685296 0.0000038 0.0014550 0.6272567 87 RPL41/RPS15A/RPS16/RPL37A/RPL29/…
REACTOME_INFLUENZA_INFECTION -1.931008 -0.4911956 0.0000018 0.0010311 0.6435518 150 KPNA4/RPL41/POLR2K/HSP90AA1/RPS15A/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -1.999921 -0.5349662 0.0000020 0.0010311 0.6272567 108 RPL41/SPCS2/RPS15A/SSR3/RPS16/…
REACTOME_TRANSLATION -1.822634 -0.4332126 0.0000004 0.0006434 0.6749629 279 RPL41/MRPL45/PTCD3/TRMT112/MRPL38/…

11.1.3.5.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
REACTOME_INTERLEUKIN_4_AND_INTERLEUKIN_13_SIGNALING 2.294714 0.7275841 0.0000000 0.0000016 0.7881868 61 PIM1/SOCS3/OSM/FOS/SOCS1/…
REACTOME_NGF_STIMULATED_TRANSCRIPTION 2.171093 0.8327271 0.0000005 0.0002587 0.6594444 23 SGK1/ID3/FOS/JUNB/EGR1
REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM 1.644510 0.4147830 0.0000005 0.0002587 0.6594444 485 PIM1/TNFRSF4/SOCS3/NFKBIA/IFNGR2/…
REACTOME_SIGNALING_BY_INTERLEUKINS 1.752203 0.4552325 0.0000023 0.0009181 0.6272567 299 PIM1/SOCS3/NFKBIA/SOCS2/IL16/…
REACTOME_TNFS_BIND_THEIR_PHYSIOLOGICAL_RECEPTORS 1.971908 0.8704376 0.0000454 0.0142974 0.5573322 11 TNFRSF4/TNFRSF18/TNFRSF1A/EDA
REACTOME_RUNX1_REGULATES_EXPRESSION_OF_COMPONENTS_OF_TIGHT_JUNCTIONS 1.614547 0.9928643 0.0000952 0.0249627 0.5384341 3 CLDN5/CBFB
REACTOME_NUCLEAR_EVENTS_KINASE_AND_TRANSCRIPTION_FACTOR_ACTIVATION 1.907472 0.6571611 0.0002072 0.0465805 0.5188481 39 SGK1/ID3/FOS/JUNB/EGR1
REACTOME_SIGNALING_BY_BMP 1.941446 0.8074343 0.0003128 0.0615510 0.4984931 14 GREM2/UBE2D3/SMURF2/SMAD7/NOG/…

11.1.3.5.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
REACTOME_NEUTROPHIL_DEGRANULATION 1.913036 0.5267514 0.0000000 0.0000026 0.7749390 390 RETN/HP/RNASE2/S100A8/PLAC8/…
REACTOME_MHC_CLASS_II_ANTIGEN_PRESENTATION 2.136595 0.6914075 0.0000031 0.0009637 0.6272567 85 HLA-DRB1/HLA-DPB1/CTSD/HLA-DPA1/HLA-DRA/…
REACTOME_ANTIMICROBIAL_PEPTIDES 2.027888 0.8628127 0.0000105 0.0027300 0.5933255 20 CLU/S100A8/S100A9/RNASE3/CD4/…
REACTOME_TCR_SIGNALING 1.946730 0.6130937 0.0000589 0.0131838 0.5573322 105 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/NFKBIA/…
REACTOME_GENERATION_OF_SECOND_MESSENGER_MOLECULES 1.964273 0.7674385 0.0001058 0.0207243 0.5384341 29 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DQB1/…
REACTOME_ANTIGEN_PROCESSING_CROSS_PRESENTATION 1.919823 0.6190395 0.0001929 0.0335851 0.5188481 89 S100A8/FCGR1A/S100A9/PSMB6/PSME2/…
REACTOME_COSTIMULATION_BY_THE_CD28_FAMILY 1.960511 0.6829988 0.0003529 0.0480993 0.4984931 54 MAP3K8/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
REACTOME_INTERFERON_GAMMA_SIGNALING 1.905652 0.6396230 0.0003174 0.0480993 0.4984931 68 FCGR1A/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
REACTOME_PD_1_SIGNALING 1.960298 0.8610723 0.0006475 0.0676439 0.4772708 17 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DQB1/…
REACTOME_RHO_GTPASES_ACTIVATE_NADPH_OXIDASES 1.907998 0.8570680 0.0007417 0.0726438 0.4772708 14 S100A8/S100A9/RAC2/CYBA

11.1.3.5.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION 2.927848 0.7921139 0.0000000 0.0000000 1.1512205 85 RPS3A/EEF2/RPS15A/EEF1A1/RPL4/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY 2.464335 0.6602788 0.0000000 0.0000001 0.8266573 92 RPS3A/RPS15A/RPL4/RPL10/RPL8/…
REACTOME_SELENOAMINO_ACID_METABOLISM 2.417585 0.6477531 0.0000000 0.0000003 0.8012156 92 RPS3A/RPS15A/PAPSS1/RPL4/RPL10/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION 2.304713 0.6175108 0.0000000 0.0000070 0.7337620 92 RPS3A/EIF4B/EIF4A2/RPS15A/RPL4/…
REACTOME_IRAK4_DEFICIENCY_TLR2_4 2.435762 0.9374072 0.0000007 0.0001828 0.6594444 12 S100A8/S100A9/CD14/BTK
REACTOME_DISEASES_OF_IMMUNE_SYSTEM 2.385226 0.8406022 0.0000066 0.0009463 0.6105269 19 S100A8/S100A9/CD14/NFKB1
REACTOME_REGULATION_OF_TLR_BY_ENDOGENOUS_LIGAND 2.308998 0.9370186 0.0000069 0.0009463 0.6105269 10 S100A8/S100A9/CD14
REACTOME_PD_1_SIGNALING 2.269443 0.8676930 0.0000624 0.0052552 0.5384341 13 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
REACTOME_ANTIMICROBIAL_PEPTIDES 2.196307 0.8397303 0.0001972 0.0119500 0.5188481 13 S100A8/S100A9/CLU/CD4
REACTOME_GENERATION_OF_SECOND_MESSENGER_MOLECULES 2.162489 0.7621052 0.0004766 0.0169423 0.4984931 19 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
REACTOME_MATURATION_OF_SARS_COV_1_NUCLEOPROTEIN -1.630307 -0.8722066 0.0035578 0.0550004 0.2798657 6 PARP4/SUMO1/PARP9
REACTOME_PYRUVATE_METABOLISM -1.750079 -0.7666272 0.0005338 0.0179708 0.4772708 15 RXRA/MPC2/PDHX/PDPR/PDK4/…
REACTOME_REGULATION_OF_PTEN_STABILITY_AND_ACTIVITY -1.654283 -0.5920429 0.0004374 0.0169423 0.4984931 55 WWP2/PSMB5/PSMC3/PSMB8/PSMA5/…
REACTOME_VISUAL_PHOTOTRANSDUCTION -1.736885 -0.7713666 0.0003413 0.0149190 0.4984931 14 CALM1/DHRS3/LRP1/METAP1/PLB1/…
REACTOME_REGULATION_OF_RUNX3_EXPRESSION_AND_ACTIVITY -1.633873 -0.5967284 0.0003154 0.0144806 0.4984931 47 PSMB5/PSMC3/PSMB8/PSMA5/UBC/…
REACTOME_PYRUVATE_METABOLISM_AND_CITRIC_ACID_TCA_CYCLE -1.716785 -0.6733585 0.0003053 0.0144806 0.4984931 28 RXRA/MPC2/ACO2/SDHD/PDHX/…
REACTOME_ORC1_REMOVAL_FROM_CHROMATIN -1.637401 -0.5904853 0.0002710 0.0136872 0.4984931 52 PSMB5/PSMC3/PSMB8/PSMA5/UBC/…
REACTOME_REGULATION_OF_RUNX2_EXPRESSION_AND_ACTIVITY -1.685489 -0.6131684 0.0001194 0.0086168 0.5384341 48 PSMC3/PSMB8/PSMA5/UBC/NR3C1/…
REACTOME_G2_M_CHECKPOINTS -1.634917 -0.5546823 0.0000368 0.0042912 0.5573322 89 YWHAG/PSMB5/PSMC3/PSMB8/PSMA5/…
REACTOME_CYTOPROTECTION_BY_HMOX1 -1.660849 -0.5613173 0.0000320 0.0040450 0.5573322 94 RXRA/PSMB5/PSMC3/NDUFA4/BACH1/…

11.1.3.6 CP_KEGG

11.1.3.6.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.6.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.6.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, no pathways were significantly enriched in that cluster.

11.1.3.6.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
KEGG_CELL_CYCLE 2.368696 0.7573590 0.0000000 0.0000000 0.9653278 107 CDC20/PLK1/PTTG1/PCNA/CDC45/…
KEGG_P53_SIGNALING_PATHWAY 2.213721 0.7785267 0.0000000 0.0000009 0.7477397 56 RRM2/CCNB2/CHEK1/CDK1/PMAIP1/…
KEGG_OOCYTE_MEIOSIS 2.073790 0.6980852 0.0000008 0.0000466 0.6594444 76 CDC20/PLK1/PTTG1/PKMYT1/CCNB2/…
KEGG_DNA_REPLICATION 2.033468 0.7982402 0.0000359 0.0013342 0.5573322 30 MCM4/PCNA/MCM7/MCM2/FEN1/…
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 1.913613 0.6293824 0.0000315 0.0013342 0.5573322 86 H3C2/H2AZ1/H4C3/H2AX/H2AC16/…
KEGG_ONE_CARBON_POOL_BY_FOLATE 1.901350 0.8649804 0.0001015 0.0027555 0.5384341 15 TYMS/DHFR/SHMT2/MTHFD1/MTHFD2/…
KEGG_PYRIMIDINE_METABOLISM 1.862932 0.6292376 0.0001037 0.0027555 0.5384341 74 TYMS/TK1/RRM2/DUT/RRM1/…
KEGG_PROGESTERONE_MEDIATED_OOCYTE_MATURATION 1.881709 0.6527769 0.0002091 0.0048611 0.5188481 59 PLK1/PKMYT1/CCNB2/CDK1/MAD2L1/…
KEGG_PATHWAYS_IN_CANCER 1.598302 0.4809982 0.0008310 0.0171738 0.4772708 182 BIRC5/NFKBIA/RAD51/BAX/BRCA2/…
KEGG_PROTEASOME 1.770950 0.6571985 0.0016089 0.0299255 0.4550599 41 PSMD2/PSME2/PSMD3/PSMA4/POMP/…
KEGG_RIBOSOME -1.582053 -0.4392298 0.0044593 0.0754020 0.4070179 84 RPS4X/RPL11/RPL10A/RPL6/RPS5/…

11.1.3.6.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
KEGG_MAPK_SIGNALING_PATHWAY 1.601598 0.4354561 0.0009307 0.0865506 0.4772708 146 JUN/DUSP1/JUND/MYC/FOS/…
KEGG_RIBOSOME -2.013517 -0.5595300 0.0000050 0.0009305 0.6105269 84 RPL41/RPS15A/RPS16/RPL37A/RPL29/…

11.1.3.6.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
KEGG_JAK_STAT_SIGNALING_PATHWAY 2.004276 0.6090932 0.0000042 0.0007840 0.6105269 76 PIM1/SOCS3/IFNGR2/SOCS2/OSM/…
KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 1.902784 0.5630419 0.0000187 0.0017260 0.5756103 94 TNFRSF4/IFNGR2/TNFRSF18/OSM/TNFRSF1A/…
KEGG_LEISHMANIA_INFECTION 1.878891 0.6215761 0.0002953 0.0182118 0.4984931 45 NFKBIA/IFNGR2/FOS/IL10/CYBA/…
KEGG_TYPE_II_DIABETES_MELLITUS 1.921674 0.7509114 0.0005875 0.0271702 0.4772708 20 SOCS3/SOCS2/SOCS1/PRKCZ/PIK3CD
KEGG_COLORECTAL_CANCER 1.817259 0.5895323 0.0012838 0.0395842 0.4550599 50 FOS/BAX/JUN/BIRC5/LEF1/…
KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY 1.784819 0.5870618 0.0011168 0.0395842 0.4550599 47 SOCS3/NFKBIA/TNFRSF1A/TRADD/ADIPOR1/…
KEGG_PARKINSONS_DISEASE 1.597732 0.4682974 0.0025029 0.0661474 0.4317077 104 MT-ND3/MT-ND1/MT-ATP6/MT-CYB/ATP5F1D/…

11.1.3.6.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
KEGG_LEISHMANIA_INFECTION 2.270116 0.7998974 0.0000006 0.0001141 0.6594444 45 FCGR1A/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/…
KEGG_INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION 2.232035 0.8706325 0.0000056 0.0005193 0.6105269 25 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_ALLOGRAFT_REJECTION 2.118426 0.8310802 0.0000346 0.0012349 0.5573322 24 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_ASTHMA 2.115794 0.9220639 0.0000304 0.0012349 0.5756103 14 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_CELL_ADHESION_MOLECULES_CAMS 2.106790 0.7173294 0.0000398 0.0012349 0.5573322 56 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 2.104295 0.7213498 0.0000510 0.0013560 0.5573322 53 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_GRAFT_VERSUS_HOST_DISEASE 2.126954 0.8296444 0.0001305 0.0025492 0.5188481 25 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_VIRAL_MYOCARDITIS 2.123395 0.7556473 0.0001374 0.0025492 0.5188481 42 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_TYPE_I_DIABETES_MELLITUS 2.120448 0.8271065 0.0001508 0.0025492 0.5188481 25 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…
KEGG_AUTOIMMUNE_THYROID_DISEASE 2.107792 0.8550304 0.0002320 0.0035955 0.5188481 20 HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DRA/HLA-DMB/…

11.1.3.6.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
KEGG_RIBOSOME 2.940224 0.7816192 0.0000000 0.0000000 1.1146645 84 RPS3A/RPS15A/RPL4/RPL10/RPL8/…
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 2.411273 0.7243773 0.0000005 0.0000478 0.6594444 41 HLA-DPB1/HLA-DQA1/FCGR1A/C1QB/HLA-DRB1/…
KEGG_LEISHMANIA_INFECTION 2.284199 0.6862026 0.0000045 0.0002737 0.6105269 41 HLA-DPB1/HLA-DQA1/FCGR1A/IFNGR2/HLA-DRB1/…
KEGG_ASTHMA 2.286285 0.8642071 0.0000114 0.0004187 0.5933255 15 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION 2.276805 0.7954074 0.0000096 0.0004187 0.5933255 22 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_CELL_ADHESION_MOLECULES_CAMS 2.155025 0.6473973 0.0000356 0.0010911 0.5573322 41 HLA-DPB1/HLA-DQA1/VCAN/HLA-DRB1/HLA-DPA1/…
KEGG_AUTOIMMUNE_THYROID_DISEASE 2.164303 0.7859898 0.0001654 0.0043481 0.5188481 18 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_ALLOGRAFT_REJECTION 2.088005 0.7454467 0.0005937 0.0121386 0.4772708 21 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_TYPE_I_DIABETES_MELLITUS 2.069722 0.7389193 0.0007541 0.0138750 0.4772708 21 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_VIRAL_MYOCARDITIS 2.083402 0.6130263 0.0011348 0.0160614 0.4550599 37 HLA-DPB1/HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DRA/…
KEGG_VIBRIO_CHOLERAE_INFECTION -1.536724 -0.6022936 0.0102672 0.0992129 0.3807304 28 TCIRG1/ATP6V1G1/PDIA4/GNAS/ATP6V1C1/…
KEGG_CITRATE_CYCLE_TCA_CYCLE -1.555799 -0.6333197 0.0108629 0.0992129 0.3807304 22 ACO2/SDHD/OGDH/IDH3A/SDHB/…
KEGG_ALZHEIMERS_DISEASE -1.378605 -0.4562842 0.0085468 0.0873675 0.3807304 111 UQCRFS1/SDHD/NDUFA4/UQCR10/CALM1/…
KEGG_EPITHELIAL_CELL_SIGNALING_IN_HELICOBACTER_PYLORI_INFECTION -1.550103 -0.5914938 0.0045306 0.0490367 0.4070179 34 TCIRG1/ATP6V1G1/ATP6V1C1/ATP6V0A1/MAPK14/…
KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION -1.634120 -0.6729056 0.0027635 0.0317803 0.4317077 20 S1PR4/HRH2/NR3C1/CYSLTR1/CHRNB1/…
KEGG_SPLICEOSOME -1.485363 -0.4918235 0.0017076 0.0209465 0.4550599 110 SNW1/CWC15/EIF4A3/TXNL4A/TRA2A/…
KEGG_PROTEASOME -1.626411 -0.6076102 0.0011233 0.0160614 0.4550599 38 PSMB5/PSMB8/PSMA5/PSMA3/PSME1/…

11.1.3.7 TFT_GTRD

11.1.3.7.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.7.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.7.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, no pathways were significantly enriched in that cluster.

11.1.3.7.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
HSD17B8_TARGET_GENES 2.505922 0.7051095 0.0000000 0.0000000 1.6595653 446 TK1/MKI67/UBE2C/CDC20/BIRC5/…
PSMB5_TARGET_GENES 2.085074 0.6145990 0.0000000 0.0000000 0.8513391 228 STMN1/LGALS1/RRM2/TUBA1B/H2AZ1/…
E2F5_TARGET_GENES 1.664771 0.4567061 0.0000000 0.0000000 0.8390889 871 TK1/STMN1/MKI67/BIRC5/TOP2A/…
AEBP2_TARGET_GENES 1.457557 0.4007301 0.0000198 0.0019530 0.5756103 802 TYMS/TK1/UBE2C/CENPM/MCM4/…
BARX1_TARGET_GENES 1.406124 0.3847913 0.0000227 0.0019530 0.5756103 973 UBE2C/CDC20/AREG/TOP2A/CENPF/…
SETD7_TARGET_GENES 1.471061 0.4063160 0.0000316 0.0023282 0.5573322 705 TYMS/STMN1/EIF5A/RRM2/DUT/…
ASH1L_TARGET_GENES 1.366928 0.3731608 0.0000421 0.0027179 0.5573322 1104 TK1/CENPM/TOP2A/CENPF/TPX2/…
ZNF597_TARGET_GENES 1.413176 0.3928013 0.0004363 0.0250162 0.4984931 589 LGALS1/CENPU/MCM7/H4C3/NFKBIA/…
PHF21A_TARGET_GENES 1.583297 0.4778271 0.0012691 0.0654868 0.4550599 178 TK1/RRM2/UHRF1/CDCA7/PTGDS/…
CSHL1_TARGET_GENES -1.861877 -0.4777497 0.0000020 0.0002574 0.6272567 155 DERA/CCDC107/HM13/APEX1/ZNF19/…

11.1.3.7.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GTF2E2_TARGET_GENES 1.483473 0.3719489 0.0009853 0.0792437 0.4550599 285 KLF6/DUSP1/PPP4C/SNHG5/PPP1R15A/…
GTF2A2_TARGET_GENES 1.450753 0.3571803 0.0010750 0.0792437 0.4550599 349 ARPC5L/KLF6/DUSP1/JUND/MYC/…
ELF2_TARGET_GENES -1.226398 -0.2632236 0.0009948 0.0792437 0.4550599 1200 C12orf4/DDX5/ACO2/SOCS4/LSM8/…
ZNF250_TARGET_GENES -1.391894 -0.3235804 0.0009795 0.0792437 0.4550599 387 NDUFS3/DNAJC11/LSM8/STX18/NSL1/…
BARX2_TARGET_GENES -1.233101 -0.2642737 0.0003832 0.0659146 0.4984931 1367 NDUFS3/C12orf4/LIN7C/SLC25A35/RPL41/…
ZSCAN2_TARGET_GENES -1.380313 -0.3098029 0.0002463 0.0659146 0.4984931 573 C12orf4/MRPL45/POLR2K/MBD5/DDX31/…
ZZZ3_TARGET_GENES -1.727348 -0.4849696 0.0003013 0.0659146 0.4984931 89 RPL41/CSNK1D/ZNF655/PSMD1/RPL37A/…

11.1.3.7.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
PSMB5_TARGET_GENES 1.914131 0.5194175 0.0000000 0.0000161 0.7195128 220 SGK1/SNHG5/NFKBIZ/ZFP36/LDHA/…
HMCES_TARGET_GENES 1.890097 0.8917071 0.0000747 0.0192655 0.5384341 9 MT-ND3/MT-CYB/MT-ND4/MT-ND4L/MT-ND6/…
SS18_SSX1_FUSION_UNIPROT_Q8IZH1_UNREVIEWED_TARGET_GENES 1.727403 0.5490251 0.0014488 0.0838067 0.4550599 61 PTGER2/ARID5B/LTBP3/RHOQ/KLF11/…
GTF2A2_TARGET_GENES 1.465602 0.3793365 0.0006167 0.0838067 0.4772708 345 SGK1/SNHG5/FOS/ZFP36/LDHA/…
ZNF282_TARGET_GENES 1.342630 0.3323978 0.0011904 0.0838067 0.4550599 690 PTGER2/SOCS3/IMPDH1/BAX/GUK1/…
ZNF560_TARGET_GENES 1.775012 0.6601199 0.0020491 0.0881121 0.4317077 27 HMGA1/LINC-PINT/OLFM2/CAPN2/KAT8/…
DROSHA_TARGET_GENES 1.736157 0.6032446 0.0017089 0.0881121 0.4550599 37 MT-ND3/MT-ND1/MT-ATP6/MT-ND2/EGR1/…
RARB_TARGET_GENES 1.559662 0.9743756 0.0019330 0.0881121 0.4550599 3 MT-ND3/MT-ND4/MT-ND4L
TFAM_TARGET_GENES 1.696094 0.8910341 0.0024859 0.0986694 0.4317077 6 MT-CYB/MT-ND2/MT-ND6/MT-ND5
NKX2_5_TARGET_GENES -1.249709 -0.2684947 0.0006981 0.0838067 0.4772708 795 HMGCR/DHX8/APPBP2-DT/NRL/STARD4/…
DLX6_TARGET_GENES -1.293392 -0.2890692 0.0013633 0.0838067 0.4550599 428 HMGCR/LINC02453/ABHD11/DHFR/ALKBH1/…
CDC5L_TARGET_GENES -1.529188 -0.3793973 0.0009913 0.0838067 0.4550599 164 HMGCR/NDUFS3/ERAP1/ZNF207/ANKMY1/…
ZNF707_TARGET_GENES -1.829861 -0.6452660 0.0014617 0.0838067 0.4550599 25 BRD1/PRMT5-AS1/COMMD10/HSPA4/ZNF48/…

11.1.3.7.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
PSMB5_TARGET_GENES 2.049872 0.5880093 0.0000000 0.0000108 0.7337620 222 ID1/ADM/SNHG5/H1-2/LDHA/…
ZNF407_TARGET_GENES -1.228450 -0.2708254 0.0004416 0.0568529 0.4984931 1409 MYO1C/FTO/DNAJC1/QSOX1/VPS33A/…
SUPT20H_TARGET_GENES -1.342613 -0.3036060 0.0000066 0.0011374 0.6105269 1033 VPS33A/EIF4A1/TTC31/MIS18A/RPS29/…
DIDO1_TARGET_GENES -1.400445 -0.3117774 0.0000001 0.0000367 0.6901325 1247 AK9/NME7/DNAJC1/TTC31/EMC2/…

11.1.3.7.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
ZNF711_TARGET_GENES -1.276627 -0.3897236 3.44e-04 0.0885925 0.4984931 861 WWP2/ADAP2/NOL11/DIS3L/MPC2/…
SUPT20H_TARGET_GENES -1.359407 -0.4163061 7.60e-06 0.0038997 0.6105269 667 ADAP2/CCNT1/ACO2/ITGAE/PRMT1/…

11.1.3.8 TFT_TFT_Legacy

11.1.3.8.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.8.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.1.3.8.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Healthy, and a negative NES indicates the reverse.

If a cluster is missing, no pathways were significantly enriched in that cluster.

11.1.3.8.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
E2F_Q4 2.235083 0.6796246 0.0000000 0.0000000 0.9545416 188 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F_Q3 2.181514 0.6730975 0.0000000 0.0000000 0.8986712 164 STMN1/MCM4/PCNA/H2AZ1/CDC45/…
E2F1DP2_01 2.199273 0.6793092 0.0000000 0.0000000 0.8870750 156 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F_Q6_01 2.158422 0.6613600 0.0000000 0.0000000 0.8753251 176 STMN1/EIF5A/MCM4/RRM2/PCNA/…
E2F4DP1_01 2.143873 0.6508513 0.0000000 0.0000000 0.8753251 190 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F_Q6 2.143655 0.6516007 0.0000000 0.0000000 0.8753251 184 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F1_Q6 2.118278 0.6430303 0.0000000 0.0000000 0.8513391 187 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F4DP2_01 2.113440 0.6430380 0.0000000 0.0000000 0.8513391 182 STMN1/MCM4/CLSPN/RRM2/PCNA/…
E2F_03 2.098822 0.6403090 0.0000000 0.0000000 0.8266573 179 STMN1/EIF5A/MCM4/LGALS1/H2AZ1/…
E2F1DP1RB_01 2.127433 0.6617685 0.0000000 0.0000000 0.8140358 145 STMN1/CLSPN/RRM2/PCNA/H2AZ1/…
CCANNAGRKGGC_UNKNOWN -2.004137 -0.6146698 0.0001317 0.0036526 0.5188481 49 RPS4X/SSH2/EXT2/YY1/RPRD1B/…

11.1.3.8.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
ATF3_Q6 1.818764 0.4968675 0.0000227 0.0138582 0.5756103 136 JUN/DUSP1/JUND/AREG/PPP1R15A/…
MEF2_03 1.778060 0.5003649 0.0001783 0.0543947 0.5188481 110 RHOB/JUN/S1PR1/STRADB/FOS/…
TATAAA_TATA_01 1.426909 0.3404044 0.0003982 0.0809728 0.4984931 497 CD69/ZBTB10/EMP3/TSC22D3/MYC/…
MEF2_Q6_01 1.755602 0.5037722 0.0006396 0.0975350 0.4772708 98 RHOB/JUN/STRADB/JCHAIN/CD180/…

11.1.3.8.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
RACCACAR_AML_Q6 1.753159 0.4949765 0.0000882 0.0538283 0.5384341 140 EIF5A/CD69/ID3/RGS1/POLG2/…
AML_Q6 1.724139 0.4872107 0.0002042 0.0622800 0.5188481 136 EIF5A/CD69/RGS1/POLG2/SRSF2/…
STAT5B_01 1.701975 0.4870626 0.0005406 0.0715985 0.4772708 122 NFKBIA/PRDM1/SOCS2/CLDN5/OSM/…
CREL_01 1.680906 0.4729864 0.0004008 0.0715985 0.4984931 143 EIF5A/CD69/NFKBIA/CLDN5/FLOT1/…
HEN1_02 1.658047 0.4810706 0.0010251 0.0715985 0.4550599 105 LRRN3/IMPDH1/UBE2D3/LZTS2/F11R/…
NFKAPPAB65_01 1.657037 0.4736872 0.0008188 0.0715985 0.4772708 127 EIF5A/CD69/NFKBIA/CLDN5/FLOT1/…
AP1_Q4_01 1.646911 0.4653875 0.0009130 0.0715985 0.4772708 136 PRDM1/SRSF2/IL10/NR1D1/REXO2/…
CHOP_01 1.644826 0.4692138 0.0010564 0.0715985 0.4550599 128 NFKBIA/PRDM1/SRSF2/H1-4/KLF12/…
WGTTNNNNNAAA_UNKNOWN 1.494007 0.3912205 0.0008268 0.0715985 0.4772708 270 CD69/PRDM1/SOCS2/FLOT1/FOS/…
HEN1_01 1.690153 0.5090659 0.0015199 0.0927120 0.4550599 85 CLDN5/IMPDH1/UBE2D3/CDK6/LZTS2/…

11.1.3.8.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
LMO2COM_02 1.839400 0.5834059 0.0000978 0.0596486 0.5384341 102 CLU/ADM/ARHGEF10L/ID2/NAA38/…
PAX5_02 1.936611 0.8885264 0.0002801 0.0709771 0.4984931 12 PIM1/RASGRP2/PFN1
TGANTCA_AP1_C 1.488211 0.4006211 0.0003491 0.0709771 0.4984931 555 PIM1/ADM/FAM20A/LGALS1/FKBP5/…

11.2 Covid_Mild_vs_Healthy

11.2.1 Number of significant DEGs across conditions in each cluster

Here we check number of differentially expressed genes (DEGs) after applying some statistical thresholds:

  • Adjusted P value < 0.1

  • Absolute Log Fold Change > 0 (i.e., any nonzero change in either direction)

  • Proportion of cells expressing gene in condition A > 0.1 if LFC is positive

  • Proportion of cells expressing gene in condition B > 0.1 if LFC is negative

Differential expression compares genes in condition A (left) versus condition B (right). The left column indicates the number of genes upregulated in the left condition, while the right column indicates genes upregulated in the right condition.

Finally, the thresholds above do not affect downstream results from pathway analysis; they are only used to count the number of DEGs. Downstream usage of DEGs can use these thresholds, or you can choose other appropriate cutoffs.

Cluster Covid_Mild_high Healthy_high
cluster_1 4 2
cluster_2 10 6
cluster_3 12 12
cluster_4 32 4
cluster_5 23 2
cluster_6 14 1
cluster_7 0 0

11.2.2 Per-cluster heatmaps of all DEGs

To ensure the differential expression results are robust, it is helpful to inspect all DEGs by visualizing them in a heatmap. For each cluster, we compare the DEGs at both the single-cell level and either the pseudobulk level (if we used pseudobulk_edgeR) or the average RISC value level (if using wilcox).

11.2.2.1 Cluster_1

11.2.2.1.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.1.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.2 Cluster_2

11.2.2.2.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.2.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.3 Cluster_3

11.2.2.3.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.3.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.4 Cluster_4

11.2.2.4.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.4.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.5 Cluster_5

11.2.2.5.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.5.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.6 Cluster_6

11.2.2.6.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.2.6.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.2.3 Gene set enrichment analysis across conditions

Pathway analysis for the cross-condition analysis is performed for overexpressed and underexpressed genes for each cluster. This is done via Gene Set Enrichment Analysis (GSEA) (Subramanian et al., 2005).

GSEA is preferred over other pathway analysis methods, such as Fisher tests or chi-square tests, because it does not require making arbitrary cutoffs to select DEGs and it takes into account how strongly differentially expressed each gene may be. For the latter, a data-driven, gene-specific weight is applied. We use a standard weighting method of -log10(P-value) * sign of Log Fold Change.

The pathways we choose in pathway analysis are derived from the Molecular Signatures Database (MSIGDB) where they are sorted by categories, such as Gene Ontology (GO) Biological Process, GO Molecular Function, KEGG, Reactome, etc. These are databases that annotate genes by function or molecular pathway.

11.2.3.1 HALLMARK

11.2.3.1.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.1.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.1.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each condition, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. Positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy, and vice-versa.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.1.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_ALPHA_RESPONSE 1.860675 0.5225121 0.0034503 0.0575053 0.4317077 89 EIF2AK2/CMPK2/IFI44L/EPSTI1/OAS1/…
HALLMARK_INFLAMMATORY_RESPONSE 1.741909 0.4831213 0.0107553 0.0896279 0.3807304 104 NFKBIA/EIF2AK2/IL7R/PTGER4/CYBB/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.670735 0.4333155 0.0095972 0.0896279 0.3807304 167 NFKBIA/EIF2AK2/IRF8/CMPK2/IFI44L/…
HALLMARK_DNA_REPAIR -1.470810 -0.3853608 0.0069397 0.0867465 0.4070179 138 GMPR2/ERCC8/VPS37B/REV3L/NUDT21/…
HALLMARK_MYC_TARGETS_V1 -1.537450 -0.3866300 0.0014790 0.0415748 0.4550599 189 KPNB1/NCBP2/COX5A/PSMD14/SNRPA1/…
HALLMARK_OXIDATIVE_PHOSPHORYLATION -1.537718 -0.3865418 0.0016630 0.0415748 0.4550599 190 COX11/COX7A2L/OGDH/ATP5MC3/COX5A/…

11.2.3.1.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.648761 0.4191647 0.0019 0.0950022 0.4550599 162 CD69/WARS1/IFI44L/SSPN/PFKP/…

11.2.3.1.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.836235 0.4509373 0.0000282 0.0014096 0.5756103 159 CD69/ARID5B/EIF2AK2/PDE4B/TNFAIP3/…
HALLMARK_INTERFERON_ALPHA_RESPONSE 1.779867 0.4827466 0.0005735 0.0095586 0.4772708 86 EIF2AK2/IFITM3/OASL/LY6E/MX1/…
HALLMARK_INFLAMMATORY_RESPONSE 1.618972 0.4276528 0.0039414 0.0394139 0.4070179 97 CD69/CCR7/EIF2AK2/PDE4B/PTGER2/…
HALLMARK_OXIDATIVE_PHOSPHORYLATION -1.483782 -0.3894638 0.0037731 0.0394139 0.4317077 188 MRPL35/SDHB/UQCR11/MTRR/ATP5F1B/…
HALLMARK_MYC_TARGETS_V1 -1.651533 -0.4341729 0.0001322 0.0033056 0.5188481 186 NCBP2/CCT2/RAD23B/CAD/CCT4/…

11.2.3.1.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_GAMMA_RESPONSE 2.219077 0.5766389 0.0000000 0.0000002 0.7614608 171 FCGR1A/IFI27/IFITM2/NFKBIA/MT2A/…
HALLMARK_INTERFERON_ALPHA_RESPONSE 2.024761 0.5815479 0.0000192 0.0003208 0.5756103 89 IFI27/IFITM2/IFITM3/IFITM1/OASL/…
HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.949830 0.5144732 0.0000155 0.0003208 0.5756103 147 MAP3K8/GADD45B/ID2/BCL2A1/NFKBIA/…
HALLMARK_COMPLEMENT 1.842105 0.4912575 0.0000766 0.0009571 0.5384341 136 C1QC/CLU/C1QA/PIM1/GZMA/…
HALLMARK_KRAS_SIGNALING_DN 1.882308 0.6238281 0.0006570 0.0065702 0.4772708 40 IFI44L/RSAD2/CAMK1D/SLC16A7/LFNG/…
HALLMARK_INFLAMMATORY_RESPONSE 1.781600 0.4835323 0.0008720 0.0072671 0.4772708 122 NFKBIA/ADM/IFITM1/ABCA1/CD40/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.772822 0.5401254 0.0012207 0.0076297 0.4550599 63 GADD45B/ID2/SGCB/CXCL8/BMP1/…
HALLMARK_MYOGENESIS 1.682561 0.4971966 0.0025593 0.0127963 0.4317077 73 CLU/GADD45B/TNNT1/BHLHE40/CDKN1A/…
HALLMARK_COAGULATION 1.705623 0.5436959 0.0030711 0.0139595 0.4317077 51 CLU/C1QA/SERPING1/PF4/PRSS23/…
HALLMARK_TGF_BETA_SIGNALING 1.701278 0.5606248 0.0037771 0.0157378 0.4317077 41 ID1/ID2/SMAD7/KLF10/FNTA
HALLMARK_OXIDATIVE_PHOSPHORYLATION -1.438241 -0.3508565 0.0054276 0.0208754 0.4070179 192 AIFM1/DLST/CYC1/TIMM50/NDUFB7/…
HALLMARK_DNA_REPAIR -1.598664 -0.4080443 0.0018895 0.0104973 0.4550599 135 AAAS/HPRT1/USP11/TSG101/ITPA/…

11.2.3.1.3.5 cluster_5
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_GAMMA_RESPONSE 2.460772 0.6327149 0 0 0.8870750 150 IFI27/IFI44L/IFIT1/ISG15/MX1/…
HALLMARK_INTERFERON_ALPHA_RESPONSE 2.618870 0.7359381 0 0 0.8390889 81 IFI27/IFI44L/IFITM1/ISG15/MX1/…

11.2.3.1.3.6 cluster_6
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_ALPHA_RESPONSE 2.640576 0.7428698 0e+00 0.0e+00 0.8513391 74 IFI27/IFITM1/IFITM3/LY6E/MX1/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 2.261363 0.5835799 1e-07 1.6e-06 0.7049757 137 IFI27/IFITM3/XAF1/LY6E/MX1/…

11.2.3.2 GO_BP

11.2.3.2.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.2.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.2.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each condition, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. Positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy, and vice-versa.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.2.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 2.369033 0.5602643 0.0000000 0.0000000 1.1512205 415 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOBP_IMMUNE_RESPONSE 1.871305 0.4080207 0.0000000 0.0000000 1.0672100 1098 TRAV38-2DV8/TRAV8-4/TRAV26-2/FASLG/NFKBIA/…
GOBP_BIOLOGICAL_PROCESS_INVOLVED_IN_INTERSPECIES_INTERACTION_BETWEEN_ORGANISMS 1.566107 0.3463131 0.0000004 0.0006192 0.6749629 908 TRAV26-2/FASLG/NFKBIA/KIR3DL1/TRAV4/…
GOBP_RESPONSE_TO_BACTERIUM 1.865006 0.4481722 0.0000011 0.0014159 0.6435518 343 TRAV26-2/FASLG/NFKBIA/IGHV3-74/IGHV3-73/…
GOBP_DEFENSE_RESPONSE 1.481718 0.3278106 0.0000349 0.0290092 0.5573322 873 FASLG/NFKBIA/KIR3DL1/IGHV3-74/EIF2AK2/…
GOBP_DEFENSE_RESPONSE_TO_OTHER_ORGANISM 1.532996 0.3488651 0.0000419 0.0313079 0.5573322 634 FASLG/KIR3DL1/IGHV3-74/EIF2AK2/IGHV3-73/…
GOBP_INNATE_IMMUNE_RESPONSE 1.552066 0.3585867 0.0001527 0.0877615 0.5188481 516 FASLG/KIR3DL1/IGHV3-74/EIF2AK2/IGHV3-73/…
GOBP_CELLULAR_AMIDE_METABOLIC_PROCESS -1.403867 -0.3110330 0.0000874 0.0543959 0.5384341 809 RPL13/MARS2/BTG2/METTL3/NEMF/…
GOBP_CELLULAR_MACROMOLECULE_BIOSYNTHETIC_PROCESS -1.345345 -0.2955738 0.0000728 0.0494436 0.5384341 1030 RPL13/BTG2/METTL3/NEMF/B4GALT4/…
GOBP_AMIDE_BIOSYNTHETIC_PROCESS -1.536444 -0.3463365 0.0000066 0.0061439 0.6105269 637 RPL13/MARS2/BTG2/METTL3/NEMF/…
GOBP_PEPTIDE_METABOLIC_PROCESS -1.555445 -0.3506830 0.0000013 0.0014248 0.6435518 650 RPL13/MARS2/BTG2/METTL3/NEMF/…
GOBP_PEPTIDE_BIOSYNTHETIC_PROCESS -1.637163 -0.3760544 0.0000004 0.0006192 0.6749629 523 RPL13/MARS2/BTG2/METTL3/NEMF/…
GOBP_CYTOPLASMIC_TRANSLATION -2.097907 -0.5573015 0.0000000 0.0000570 0.7337620 134 RPL13/METTL3/NEMF/YTHDF2/RPL37A/…

11.2.3.2.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 2.428342 0.5655280 0.0000000 0.0000000 1.1330899 362 IGHV5-10-1/IGKV1D-17/IGHV4-4/IGLV4-3/IGKV2-24/…
GOBP_IMMUNOGLOBULIN_PRODUCTION 2.349868 0.6032476 0.0000000 0.0000016 0.8012156 161 IGKV1D-17/IGLV4-3/IGKV2-24/IGLV5-45/IGLV5-37/…
GOBP_MEMBRANE_INVAGINATION 2.476607 0.6786930 0.0000000 0.0000023 0.7749390 101 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_POSITIVE_REGULATION_OF_B_CELL_ACTIVATION 2.417318 0.6643847 0.0000000 0.0000023 0.7749390 105 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHG2/IGHV1-69-2/…
GOBP_PHAGOCYTOSIS_RECOGNITION 2.542065 0.7466718 0.0000000 0.0000027 0.7614608 65 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_COMPLEMENT_ACTIVATION 2.527229 0.7339930 0.0000000 0.0000034 0.7614608 70 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_CELL_RECOGNITION 2.380411 0.6502955 0.0000000 0.0000034 0.7614608 108 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_HUMORAL_IMMUNE_RESPONSE 2.346764 0.6201149 0.0000000 0.0000034 0.7614608 133 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_B_CELL_RECEPTOR_SIGNALING_PATHWAY 2.388981 0.6525518 0.0000000 0.0000037 0.7477397 103 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_HUMORAL_IMMUNE_RESPONSE_MEDIATED_BY_CIRCULATING_IMMUNOGLOBULIN 2.449157 0.7048141 0.0000000 0.0000097 0.7195128 74 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOBP_AMIDE_BIOSYNTHETIC_PROCESS -1.376894 -0.3216217 0.0004876 0.0697338 0.4984931 633 NT5C3B/RPS14/CNOT8/BZW2/RBM4/…
GOBP_RIBOSOME_BIOGENESIS -1.546860 -0.3983614 0.0004471 0.0651945 0.4984931 211 MRM1/RPS14/FTSJ3/POP5/RPS25/…
GOBP_PEPTIDE_BIOSYNTHETIC_PROCESS -1.425108 -0.3374772 0.0004322 0.0642825 0.4984931 525 RPS14/CNOT8/BZW2/RBM4/PAIP2B/…
GOBP_NCRNA_PROCESSING -1.528356 -0.3751245 0.0004294 0.0642825 0.4984931 323 MRM1/RPS14/FTSJ3/MRTO4/TRMT5/…
GOBP_NUCLEAR_TRANSCRIBED_MRNA_CATABOLIC_PROCESS -1.734358 -0.4821648 0.0003501 0.0553943 0.4984931 106 NT5C3B/CNOT8/MRTO4/NBDY/CNOT7/…
GOBP_NCRNA_METABOLIC_PROCESS -1.449554 -0.3477894 0.0002790 0.0450951 0.4984931 435 MRM1/RPS14/FTSJ3/MRTO4/TRMT5/…
GOBP_RRNA_METABOLIC_PROCESS -1.627560 -0.4115841 0.0001403 0.0237122 0.5188481 239 MRM1/RPS14/FTSJ3/MRTO4/KRI1/…
GOBP_PEPTIDE_METABOLIC_PROCESS -1.431015 -0.3341687 0.0000556 0.0096124 0.5573322 650 NT5C3B/RPS14/CNOT8/BZW2/RBM4/…
GOBP_RNA_PROCESSING -1.464733 -0.3379647 0.0000113 0.0020058 0.5933255 783 MRM1/RPS14/FTSJ3/RBM4/MRTO4/…
GOBP_CYTOPLASMIC_TRANSLATION -1.925943 -0.5184987 0.0000023 0.0004238 0.6272567 136 RPS14/RBM4/RPL8/RPL7A/RPS25/…

11.2.3.2.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 1.974708 0.4436898 0.0000000 0.0000001 0.8870750 416 TRBV18/TRBV5-5/TRBV10-1/TRAV27/TRBV6-6/…
GOBP_PROTEIN_DNA_COMPLEX_ASSEMBLY 2.110473 0.5362001 0.0000008 0.0019645 0.6594444 130 H1-4/H1-2/H3C2/H3C7/H3C3/…
GOBP_NUCLEOSOME_ASSEMBLY 2.168932 0.6008862 0.0000027 0.0049671 0.6272567 74 H1-4/H1-2/H3C2/H3C7/H3C3/…
GOBP_DNA_PACKAGING 1.901859 0.4674458 0.0000056 0.0083538 0.6105269 162 H1-4/H1-2/H3C2/H3C10/AKAP8L/…
GOBP_DEFENSE_RESPONSE_TO_BACTERIUM 1.889156 0.4692451 0.0000202 0.0187732 0.5756103 148 TRAV27/GSDMA/IGHV3-74/TNFRSF1A/IGKV3-20/…
GOBP_CHROMOSOME_CONDENSATION 2.216715 0.7133027 0.0000310 0.0242270 0.5573322 34 H1-4/H1-2/AKAP8L/H1-3/CDK1/…
GOBP_COMPLEMENT_ACTIVATION 2.112851 0.5985522 0.0000326 0.0242270 0.5573322 61 CLU/IGHV3-74/IGHV5-10-1/IGHV2-26/IGHV1-18/…
GOBP_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.069228 0.5804109 0.0000477 0.0322515 0.5573322 67 EIF2AK2/IFITM3/OASL/MX1/OAS2/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.131175 0.6471560 0.0000584 0.0354966 0.5573322 45 EIF2AK2/IFITM3/OASL/MX1/OAS2/…
GOBP_PHAGOCYTOSIS_RECOGNITION 2.026473 0.5935845 0.0001426 0.0623784 0.5188481 52 IGHV3-74/IGHV5-10-1/IGHV2-26/IGHV1-18/IGHV3-73/…
GOBP_CYTOSOLIC_TRANSPORT -1.736974 -0.4689094 0.0001922 0.0752298 0.5188481 133 USP7/TBC1D10C/DOP1A/ARFRP1/SNX5/…
GOBP_RIBOSOME_ASSEMBLY -1.860932 -0.5904210 0.0001876 0.0752298 0.5188481 50 XRCC5/DDX28/SBDS/RPL10/MTERF3/…

11.2.3.2.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOBP_COMPLEMENT_ACTIVATION 2.431221 0.7407868 0.0000000 0.0000135 0.7477397 58 C1QC/C1QB/CLU/C1QA/SERPING1/…
GOBP_NEGATIVE_REGULATION_OF_HEMOPOIESIS 2.362874 0.7243888 0.0000001 0.0001007 0.7049757 56 C1QC/ID2/SMAD7/RC3H2/BCL6/…
GOBP_CELL_JUNCTION_DISASSEMBLY 2.152596 0.9558911 0.0000001 0.0001012 0.6901325 10 C1QC/C1QB/C1QA
GOBP_HUMORAL_IMMUNE_RESPONSE_MEDIATED_BY_CIRCULATING_IMMUNOGLOBULIN 2.283553 0.6942718 0.0000008 0.0004842 0.6594444 59 C1QC/C1QB/CLU/C1QA/FCGR2B/…
GOBP_HUMORAL_IMMUNE_RESPONSE 2.104768 0.5693051 0.0000020 0.0008855 0.6272567 124 C1QC/C1QB/CLU/C1QA/RNASE2/…
GOBP_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.257905 0.6685044 0.0000027 0.0010720 0.6272567 67 IFITM2/IFITM3/IFITM1/OASL/RSAD2/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.195126 0.7070095 0.0000117 0.0038146 0.5933255 44 IFITM2/IFITM3/IFITM1/OASL/APOBEC3A/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_PROCESS 2.093790 0.6222968 0.0000400 0.0093507 0.5573322 66 IFITM2/IFITM3/IFITM1/OASL/RSAD2/…
GOBP_RESPONSE_TO_TYPE_I_INTERFERON 2.113700 0.6548269 0.0001241 0.0238255 0.5188481 53 IFI27/IFITM2/IFITM3/IFITM1/ISG15/…
GOBP_RESPONSE_TO_INTERFERON_BETA 2.109431 0.7679136 0.0006536 0.0873725 0.4772708 24 IFITM2/IFITM3/IFITM1/AIM2/OAS1/…
GOBP_MRNA_CIS_SPLICING_VIA_SPLICEOSOME -1.905173 -0.7051960 0.0007271 0.0938497 0.4772708 21 WBP11/WBP4/RBM17/CWC15/PRPF40A/…
GOBP_CELL_CYCLE_PROCESS -1.318557 -0.2863453 0.0005959 0.0841725 0.4772708 731 CCNY/AAAS/PPP2R2D/RAD51B/SDCCAG8/…
GOBP_MRNA_METABOLIC_PROCESS -1.354830 -0.2997693 0.0003729 0.0547292 0.4984931 566 WBP11/MOV10/DCP1B/TBRG4/RNPC3/…
GOBP_MRNA_PROCESSING -1.428778 -0.3253080 0.0002580 0.0410893 0.4984931 402 WBP11/WBP4/TBRG4/RNPC3/NBDY/…
GOBP_VERY_LONG_CHAIN_FATTY_ACID_METABOLIC_PROCESS -1.962502 -0.8056394 0.0001569 0.0279641 0.5188481 14 ABCD4/HACD2/ABCD3/TECR/HSD17B4
GOBP_NUCLEOSIDE_BISPHOSPHATE_METABOLIC_PROCESS -1.822378 -0.5053500 0.0001314 0.0245887 0.5188481 79 DLST/HACD2/PDHA1/PAPSS1/TECR/…
GOBP_CELL_DIVISION -1.527260 -0.3476624 0.0000303 0.0075616 0.5756103 378 CCNY/PPP2R2D/AHCTF1/NSMCE2/HAUS1/…

11.2.3.2.3.5 cluster_5
pathway NES ES pval padj log2err size leadingEdge
GOBP_DEFENSE_RESPONSE_TO_SYMBIONT 2.141260 0.5455479 0.0000000 0.0000935 0.7477397 176 IFI27/APOBEC3A/IFIT1/IFITM1/IFI6/…
GOBP_RESPONSE_TO_VIRUS 1.930244 0.4736786 0.0000002 0.0007369 0.6901325 232 IFI27/IFI44L/APOBEC3A/IFIT1/IFITM1/…
GOBP_INNATE_IMMUNE_RESPONSE 1.697166 0.3933076 0.0000003 0.0007756 0.6749629 391 IFI27/C1QB/APOBEC3A/IFIT1/IFITM1/…
GOBP_RESPONSE_TO_TYPE_I_INTERFERON 2.519005 0.7725815 0.0000008 0.0014283 0.6594444 45 IFI27/IFIT1/IFITM1/ISG15/MX1/…
GOBP_DEFENSE_RESPONSE_TO_OTHER_ORGANISM 1.548131 0.3589170 0.0000015 0.0022037 0.6435518 489 IFI27/C1QB/IFI44L/APOBEC3A/IFIT1/…
GOBP_VIRAL_GENOME_REPLICATION 2.268702 0.6151272 0.0000043 0.0052296 0.6105269 90 IFI27/APOBEC3A/IFIT1/IFITM1/ISG15/…
GOBP_VIRAL_LIFE_CYCLE 1.882133 0.4747575 0.0000459 0.0371390 0.5573322 179 IFI27/APOBEC3A/IFIT1/IFITM1/ISG15/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_PROCESS 2.200760 0.6426592 0.0000885 0.0586393 0.5384341 61 APOBEC3A/IFIT1/IFITM1/ISG15/MX1/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.161283 0.6687871 0.0001194 0.0724794 0.5384341 42 APOBEC3A/IFIT1/IFITM1/ISG15/MX1/…
GOBP_SYNAPSE_PRUNING 1.983686 0.9596783 0.0001393 0.0780763 0.5188481 6 C1QB/C3

11.2.3.2.3.6 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOBP_RESPONSE_TO_TYPE_I_INTERFERON 2.538164 0.7676580 0.0000001 0.0000930 0.7049757 46 IFI27/IFITM1/IFITM3/MX1/IFITM2/…
GOBP_DEFENSE_RESPONSE_TO_SYMBIONT 2.163648 0.5400973 0.0000009 0.0007951 0.6594444 163 IFI27/IFI6/IFITM1/IFITM3/DDIT4/…
GOBP_RESPONSE_TO_INTERFERON_BETA 2.442482 0.8649229 0.0000011 0.0008919 0.6435518 21 IFITM1/IFITM3/XAF1/IFITM2/CAPN2
GOBP_VIRAL_GENOME_REPLICATION 2.330376 0.6357810 0.0000012 0.0008919 0.6435518 87 IFI27/IFITM1/IFITM3/MX1/ISG20/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_ENTRY_INTO_HOST_CELL 2.160108 0.9026435 0.0000080 0.0052642 0.5933255 11 IFITM1/IFITM3/LY6E/IFITM2/FCN1/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_GENOME_REPLICATION 2.319381 0.7187952 0.0000817 0.0455892 0.5384341 38 IFITM1/IFITM3/MX1/ISG20/IFITM2/…
GOBP_MODULATION_BY_HOST_OF_VIRAL_GENOME_REPLICATION 2.141035 0.8545904 0.0000958 0.0496338 0.5384341 13 IFI27/STOM
GOBP_NEGATIVE_REGULATION_OF_VIRAL_LIFE_CYCLE 2.259694 0.8916490 0.0001154 0.0558291 0.5384341 14 IFITM1/IFITM3/LY6E/IFITM2/FCN1/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_PROCESS 2.341837 0.6833816 0.0001648 0.0612277 0.5188481 55 IFITM1/IFITM3/LY6E/MX1/ISG20/…
GOBP_REGULATION_OF_BIOLOGICAL_PROCESS_INVOLVED_IN_SYMBIOTIC_INTERACTION 2.324339 0.7864799 0.0003155 0.0995452 0.4984931 26 IFITM1/IFITM3/LY6E/IFITM2/FCN1/…

11.2.3.3 GO_MF

11.2.3.3.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.3.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.3.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each condition, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. Positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy, and vice-versa.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.3.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOMF_ANTIGEN_BINDING 2.117260 0.5683886 8.39e-05 0.0351880 0.5384341 123 TRAV8-4/TRAV12-2/IGHV3-74/IGHV3-73/IL7R/…
GOMF_SIGNALING_RECEPTOR_BINDING 1.541810 0.3528771 7.53e-05 0.0351880 0.5384341 601 S100B/TRAV8-4/FASLG/IGHV3-74/IGHV3-73/…
GOMF_LIGASE_ACTIVITY -1.808788 -0.4785819 6.30e-05 0.0351880 0.5384341 132 PCCA/MARS2/NADSYN1/SAE1/AARS1/…
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -1.892482 -0.4919282 4.50e-06 0.0076226 0.6105269 148 RPL13/MRPL41/RPL37A/RPL10A/RPL4/…

11.2.3.3.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOMF_ANTIGEN_BINDING 2.426503 0.6422670 0.00e+00 0.0000011 0.8012156 126 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOMF_IMMUNOGLOBULIN_RECEPTOR_BINDING 2.523315 0.7445718 1.00e-07 0.0000583 0.7049757 61 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHV1-69-2/…
GOMF_SIGNALING_RECEPTOR_BINDING 1.489865 0.3340615 2.67e-05 0.0088396 0.5756103 570 IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/IGHV1-69-2/…
GOMF_RNA_BINDING -1.362048 -0.3039544 2.28e-05 0.0088396 0.5756103 1389 MRM1/N4BP1/RPS14/CNOT8/FTSJ3/…
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -1.902452 -0.5004558 2.60e-06 0.0014351 0.6272567 149 RPS14/RPL8/RPL7A/RPS25/MRPS35/…

11.2.3.3.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOMF_ANTIGEN_BINDING 1.967821 0.5092179 9.5e-06 0.015686 0.5933255 119 IGLV3-21/IGKV2-30/IGHV3-74/IGKV3-20/HLA-DRB5/…

11.2.3.3.3.4 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOMF_LAMIN_BINDING 2.070413 0.9126548 6.71e-05 0.057805 0.5384341 10 IFI27/SUN1
GOMF_ANTIGEN_BINDING 2.062157 0.6127616 7.26e-05 0.057805 0.5384341 55 IGLV1-40/IGHV1-69D/CD1C/IGLV3-21/IGKV3-20/…

11.2.3.4 GO_CC

11.2.3.4.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.4.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.4.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each condition, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. Positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy, and vice-versa.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.4.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOCC_T_CELL_RECEPTOR_COMPLEX 2.810769 0.8135368 0.0000000 0.0000000 1.0175448 79 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOCC_PLASMA_MEMBRANE_SIGNALING_RECEPTOR_COMPLEX 2.480113 0.6597571 0.0000000 0.0000000 0.9101197 144 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOCC_RECEPTOR_COMPLEX 2.372804 0.5944980 0.0000000 0.0000000 0.8986712 214 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOCC_PLASMA_MEMBRANE_PROTEIN_COMPLEX 2.187653 0.5304449 0.0000000 0.0000000 0.8266573 280 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOCC_SOMATODENDRITIC_COMPARTMENT 1.741551 0.4175717 0.0001192 0.0147172 0.5384341 315 S100B/CYBB/SLA/STAT1/NCDN/…
GOCC_MEMBRANE_PROTEIN_COMPLEX 1.436362 0.3236719 0.0002872 0.0236458 0.4984931 739 TRAV38-2DV8/TRAV8-4/TRAV26-2/TRAV12-2/TRBV20-1/…
GOCC_CELL_BODY 1.764825 0.4429838 0.0007927 0.0602450 0.4772708 212 S100B/CYBB/NCDN/TPX2/TOP1/…
GOCC_MHC_CLASS_II_PROTEIN_COMPLEX 1.834732 0.7393508 0.0013192 0.0930986 0.4550599 15 HLA-DQB2/HLA-DRB5/HLA-DOB/HLA-DRB1/HLA-DPA1/…
GOCC_CYTOSOLIC_RIBOSOME -1.783070 -0.4942858 0.0002826 0.0236458 0.4984931 91 RPL37A/RPS27/RPL10A/RPL4/RPL27A/…
GOCC_INTRACELLULAR_PROTEIN_CONTAINING_COMPLEX -1.425841 -0.3218254 0.0001487 0.0157025 0.5188481 595 PPP2R1B/PSMB10/CSNK2B/PPP4R2/FBXO8/…
GOCC_RIBOSOMAL_SUBUNIT -1.632796 -0.4211812 0.0001589 0.0157025 0.5188481 161 RPL13/MRPL41/RPL37A/RPS27/RPL10A/…
GOCC_CATALYTIC_COMPLEX -1.335781 -0.2921304 0.0000979 0.0138217 0.5384341 1165 GMPR2/PPP2R1B/RCOR3/POP7/PSMB10/…
GOCC_LARGE_RIBOSOMAL_SUBUNIT -1.880783 -0.5119930 0.0000398 0.0065560 0.5573322 103 RPL13/MRPL41/RPL10A/RPL4/MRPL55/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -2.227967 -0.6842072 0.0000005 0.0000913 0.6749629 50 RPL13/RPL37A/RPL10A/RPL4/RPL27A/…

11.2.3.4.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOCC_IMMUNOGLOBULIN_COMPLEX 2.877338 0.7668488 0.0000000 0.0000000 1.1512205 124 IGHV5-10-1/IGKV1D-17/IGHV4-4/IGLV4-3/IGKV2-24/…
GOCC_IMMUNOGLOBULIN_COMPLEX_CIRCULATING 2.501185 0.7606068 0.0000000 0.0000002 0.8140358 60 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 2.120924 0.5267277 0.0000000 0.0000056 0.7337620 209 IGHV5-10-1/IGHV4-4/CD69/IGHV3-20/IGHV1-24/…
GOCC_CELL_SURFACE 1.841232 0.4294251 0.0000002 0.0000448 0.6901325 355 IGHV5-10-1/IGHV4-4/CD69/IGHV3-20/IGHV1-24/…
GOCC_SIDE_OF_MEMBRANE 1.819743 0.4293807 0.0000008 0.0001334 0.6594444 333 IGHV5-10-1/IGHV4-4/IGHV3-20/IGHV1-24/IGHG2/…
GOCC_CYTOCHROME_COMPLEX -1.875807 -0.6551574 0.0003830 0.0189385 0.4984931 29 COX6C/COX7B/NDUFA4/UQCR11/COX7C/…
GOCC_RESPIRATORY_CHAIN_COMPLEX_IV -1.910721 -0.7750233 0.0002386 0.0124183 0.5188481 15 COX6C/COX7B/NDUFA4/COX7C/COX5A/…
GOCC_CYTOSOLIC_SMALL_RIBOSOMAL_SUBUNIT -1.883362 -0.6242331 0.0001859 0.0102117 0.5188481 39 RPS14/RPS25/RPS17/RPS28/RPS27A/…
GOCC_POLYSOMAL_RIBOSOME -1.985078 -0.6933221 0.0000647 0.0045713 0.5384341 29 RPL8/RPL7A/RPL18A/RPS28/RPL39/…
GOCC_LARGE_RIBOSOMAL_SUBUNIT -1.843497 -0.5156260 0.0000243 0.0019993 0.5756103 103 RPL8/RPL7A/MRPL52/RPL35A/MRPL44/…
GOCC_RESPIRASOME -1.921574 -0.5485906 0.0000181 0.0017926 0.5756103 89 COX6C/COX7B/NDUFA4/UQCR11/HIGD2A/…
GOCC_POLYSOME -2.060628 -0.6289142 0.0000071 0.0007761 0.6105269 58 MCRS1/RPL8/RPL7A/AGO1/RPL18A/…
GOCC_RIBOSOMAL_SUBUNIT -1.837359 -0.4836669 0.0000060 0.0007453 0.6105269 160 RPS14/RPL8/RPL7A/RPS25/MRPS28/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -2.091839 -0.6563991 0.0000018 0.0002555 0.6435518 50 RPL8/RPL7A/RPL35A/RPL18A/RPL14/…
GOCC_CYTOSOLIC_RIBOSOME -2.145407 -0.6109694 0.0000000 0.0000056 0.7337620 91 RPS14/RPL8/RPL7A/RPS25/RPL35A/…

11.2.3.4.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOCC_IMMUNOGLOBULIN_COMPLEX 2.160772 0.5807815 0.0000011 0.0009178 0.6435518 92 IGLV3-21/IGKV2-30/IGHV3-74/IGKV3-20/IGLV9-49/…
GOCC_T_CELL_RECEPTOR_COMPLEX 2.082930 0.5608558 0.0000019 0.0009178 0.6435518 95 TRBV18/TRBV5-5/TRBV10-1/TRBV6-6/TRAV14DV4/…
GOCC_DNA_PACKAGING_COMPLEX 2.092896 0.5757801 0.0000074 0.0024103 0.6105269 80 H1-4/H1-2/H3C2/H3C10/H3C7/…
GOCC_PROTEIN_DNA_COMPLEX 1.858029 0.4773662 0.0000101 0.0024698 0.5933255 135 H1-4/H1-2/H3C2/H3C10/H3C7/…
GOCC_PLASMA_MEMBRANE_PROTEIN_COMPLEX 1.647734 0.3837028 0.0000323 0.0052788 0.5573322 283 TRBV18/TRBV5-5/TRBV10-1/TRAV27/TRBV6-6/…
GOCC_SPHERICAL_HIGH_DENSITY_LIPOPROTEIN_PARTICLE 1.624244 0.9913244 0.0000275 0.0052788 0.5756103 3 CLU/APOM
GOCC_PLASMA_MEMBRANE_SIGNALING_RECEPTOR_COMPLEX 1.820071 0.4537437 0.0000505 0.0070777 0.5573322 154 TRBV18/TRBV5-5/TRBV10-1/TRAV27/TRBV6-6/…
GOCC_RECEPTOR_COMPLEX 1.587443 0.3831787 0.0001581 0.0193832 0.5188481 213 TRBV18/TRBV5-5/TRBV10-1/TRAV27/TRBV6-6/…
GOCC_IMMUNOGLOBULIN_COMPLEX_CIRCULATING 1.939151 0.5819781 0.0002973 0.0291684 0.4984931 48 IGHV3-74/IGKV3-20/IGHV5-10-1/IGHV2-26/IGHV1-18/…
GOCC_CALCIUM_CHANNEL_COMPLEX 2.013736 0.7540111 0.0005838 0.0520672 0.4772708 17 PDE4D/PDE4B/MCUB/MICU1/PTPA/…
GOCC_AUTOPHAGOSOME_MEMBRANE -1.785206 -0.5962987 0.0009338 0.0763369 0.4772708 39 ATP6AP2/ATG9A/RAB7A/TECPR1/ATG14/…
GOCC_MITOCHONDRIAL_PROTEIN_CONTAINING_COMPLEX -1.574719 -0.3988374 0.0002286 0.0249173 0.5188481 250 MRPL35/SDHB/IMMP1L/UQCR11/MRPL38/…

11.2.3.4.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOCC_BLOOD_MICROPARTICLE 2.293026 0.7360336 0.0000003 0.0001473 0.6749629 52 C1QC/C1QB/CLU/IGLV3-21/HSPA1B/…
GOCC_EXTERNAL_ENCAPSULATING_STRUCTURE 2.122080 0.5969957 0.0000003 0.0001473 0.6749629 109 C1QC/C1QB/CLU/C1QA/SERPING1/…
GOCC_COLLAGEN_TRIMER 2.059335 0.9498680 0.0000009 0.0002230 0.6594444 10 C1QC/C1QB/MARCO/C1QA/COL8A2
GOCC_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX 2.113216 0.6098160 0.0000029 0.0005685 0.6272567 89 C1QC/C1QB/C1QA/SERPING1/LGALS3BP/…
GOCC_HIGH_DENSITY_LIPOPROTEIN_PARTICLE 1.911153 0.9667868 0.0000337 0.0055450 0.5573322 7 CLU/LCAT/PLA2G7
GOCC_CELL_SURFACE 1.570645 0.3805374 0.0000698 0.0098402 0.5384341 362 CLU/ABCA1/CD40/FCGR2B/FOLR3/…
GOCC_IMMUNOGLOBULIN_COMPLEX 1.775126 0.5226512 0.0004965 0.0535858 0.4772708 83 IGLV3-21/IGLV3-1/TRDC/IGLV3-19/IGKV2D-30/…
GOCC_DNA_POLYMERASE_COMPLEX -1.827871 -0.7049446 0.0010862 0.0765785 0.4550599 18 MAD2L2/POLD2/POLG/POLD3/PRIM2/…
GOCC_INTRACELLULAR_PROTEIN_CONTAINING_COMPLEX -1.331250 -0.2914627 0.0006548 0.0538577 0.4772708 576 PPP2R2D/SYVN1/PEF1/FBXW5/DERL2/…
GOCC_MITOCHONDRIAL_PROTEIN_CONTAINING_COMPLEX -1.474939 -0.3492730 0.0005972 0.0535858 0.4772708 251 MFN1/TIMM22/CYC1/MRPL55/TIMM50/…
GOCC_THO_COMPLEX -1.762032 -0.8997731 0.0005736 0.0535858 0.4772708 6 THOC1/THOC2/THOC6
GOCC_TRANSFERASE_COMPLEX -1.350484 -0.2945855 0.0003318 0.0409368 0.4984931 609 CCNY/HCFC2/PIGM/SYVN1/DLST/…
GOCC_CATALYTIC_COMPLEX -1.407214 -0.2954801 0.0000008 0.0002230 0.6594444 1120 CCNY/HCFC2/PPP2R2D/PIGM/DLST/…

11.2.3.4.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOCC_UBIQUITIN_CONJUGATING_ENZYME_COMPLEX -1.776950 -0.8913531 0.0001535 0.0738115 0.5188481 7 UBE2B/RNF20/UBE2N/UBE2A
GOCC_MITOCHONDRIAL_PROTEIN_CONTAINING_COMPLEX -1.749688 -0.4445528 0.0000022 0.0020777 0.6272567 227 SDHB/CHCHD3/MRPS9/NDUFS4/NDUFB9/…

11.2.3.5 CP_REACTOME

11.2.3.5.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.5.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.5.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy; a negative NES indicates enrichment in Healthy relative to Covid_Mild.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.5.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
REACTOME_NUCLEAR_SIGNALING_BY_ERBB4 2.237662 0.9034019 0.0000124 0.0016262 0.5933255 15 S100B/TAB2/ADAM17
REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM 1.515736 0.3530944 0.0003204 0.0297386 0.4984931 501 S100B/FASLG/NFKBIA/EIF2AK2/IFNGR1/…
REACTOME_SIGNALING_BY_ERBB4 2.250647 0.7763313 0.0005032 0.0417958 0.4772708 29 S100B/TAB2
REACTOME_DDX58_IFIH1_MEDIATED_INDUCTION_OF_INTERFERON_ALPHA_BETA 2.189480 0.6625634 0.0007737 0.0530800 0.4772708 59 S100B/NFKBIA/TNFAIP3/HERC5/IRF7/…
REACTOME_FASL_CD95L_SIGNALING 1.760740 0.9416871 0.0009368 0.0615960 0.4772708 5 FASLG/CASP10
REACTOME_TAK1_ACTIVATES_NFKB_BY_PHOSPHORYLATION_AND_ACTIVATION_OF_IKKS_COMPLEX 2.084233 0.7189291 0.0014383 0.0785566 0.4550599 29 S100B/NFKBIA/S100A12/NKIRAS2/TRAF6
REACTOME_ADVANCED_GLYCOSYLATION_ENDPRODUCT_RECEPTOR_SIGNALING 2.034035 0.8809841 0.0013110 0.0785566 0.4550599 11 S100B/LGALS3/S100A12
REACTOME_TRAF6_MEDIATED_NF_KB_ACTIVATION 2.216092 0.8013799 0.0016318 0.0804660 0.4550599 22 S100B/NFKBIA/S100A12/NKIRAS2/TRAF6
REACTOME_MITOTIC_TELOPHASE_CYTOKINESIS 1.853543 0.7906865 0.0016039 0.0804660 0.4550599 12 KIF23/PLK1/WAPL/PDS5B/PDS5A/…
REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL 1.929572 0.5127398 0.0021142 0.0953214 0.4317077 130 TRAV8-4/KIR3DL1/IGLV3-21/IGKV4-1/IGLV3-1/…
REACTOME_ACTIVATION_OF_THE_MRNA_UPON_BINDING_OF_THE_CAP_BINDING_COMPLEX_AND_EIFS_AND_SUBSEQUENT_BINDING_TO_43S -1.912835 -0.5771269 0.0004972 0.0417958 0.4772708 52 RPS27/EIF4EBP1/RPS2/RPS23/EIF4B/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -1.967008 -0.5513496 0.0000034 0.0004815 0.6272567 99 RPL13/RPL37A/RPS27/RPL10A/RPL4/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -1.987362 -0.5469227 0.0000017 0.0003003 0.6435518 107 RPL13/SEC61G/RPL37A/RPS27/RPL10A/…
REACTOME_SELENOAMINO_ACID_METABOLISM -2.033909 -0.5688141 0.0000008 0.0001575 0.6594444 98 RPL13/RPL37A/RPL10A/RPL4/RPL27A/…
REACTOME_NONSENSE_MEDIATED_DECAY_NMD -2.028055 -0.5551614 0.0000004 0.0001098 0.6749629 110 RPL13/RPL37A/RPS27/RPL10A/NCBP2/…
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -2.105282 -0.5998711 0.0000004 0.0001098 0.6749629 86 RPL13/RPL37A/RPL10A/RPL4/RPL27A/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -2.057748 -0.5802567 0.0000002 0.0000858 0.6901325 94 RPL13/RPL37A/RPS27/RPL10A/RPL4/…
REACTOME_METABOLISM_OF_AMINO_ACIDS_AND_DERIVATIVES -1.859176 -0.4578624 0.0000002 0.0000827 0.6901325 252 RPL13/PSMB10/OGDH/RPL37A/PSMC3/…
REACTOME_REGULATION_OF_EXPRESSION_OF_SLITS_AND_ROBOS -2.044104 -0.5317577 0.0000001 0.0000459 0.7195128 150 RPL13/PSMB10/RPL37A/PSMC3/RPS27/…
REACTOME_SIGNALING_BY_ROBO_RECEPTORS -2.025743 -0.5175251 0.0000000 0.0000221 0.7477397 180 RPL13/PSMB10/VASP/RPL37A/PSMC3/…

11.2.3.5.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
REACTOME_CELL_SURFACE_INTERACTIONS_AT_THE_VASCULAR_WALL 1.931837 0.5203953 0.0001405 0.0121494 0.5188481 114 IGKV2D-30/IGHV2-70/JCHAIN/IGLV7-43/DOK2/…
REACTOME_ROLE_OF_PHOSPHOLIPIDS_IN_PHAGOCYTOSIS 1.966460 0.5749386 0.0002815 0.0219183 0.4984931 71 IGHG2/IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 1.955028 0.5939960 0.0004891 0.0346166 0.4772708 55 IGHG2/IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_ROLE_OF_LAT2_NTAL_LAB_ON_CALCIUM_MOBILIZATION 1.916228 0.5752834 0.0007124 0.0482238 0.4772708 60 IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/IGKV1D-16/…
REACTOME_COMPLEMENT_CASCADE 1.947265 0.5812776 0.0008428 0.0529726 0.4772708 64 IGHG2/IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_FCGR_ACTIVATION 1.871883 0.5619701 0.0011434 0.0642208 0.4550599 60 IGHG2/IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_FCERI_MEDIATED_CA_2_MOBILIZATION 1.862655 0.5404683 0.0011961 0.0642208 0.4550599 75 IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/IGKV1D-16/…
REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 1.933300 0.5891708 0.0014435 0.0721250 0.4550599 53 IGHG2/IGKV2D-30/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_SCAVENGING_OF_HEME_FROM_PLASMA 1.897282 0.5867983 0.0017384 0.0820200 0.4550599 49 IGKV2D-30/IGHV2-70/IGKV2D-28/IGHA1/IGKV1D-16/…
REACTOME_FCERI_MEDIATED_MAPK_ACTIVATION 1.806300 0.5281123 0.0020414 0.0908137 0.4317077 71 IGKV2D-30/JUN/IGHV2-70/IGLV7-43/IGKV2D-28/…
REACTOME_RESPIRATORY_ELECTRON_TRANSPORT -1.884359 -0.5378053 0.0000102 0.0010639 0.5933255 96 COX16/COX6C/COX7B/NDUFA4/UQCR11/…
REACTOME_INFLUENZA_INFECTION -1.864096 -0.5015747 0.0000031 0.0003989 0.6272567 150 RPS14/RPL8/RPL7A/POLR2I/RPS25/…
REACTOME_CELLULAR_RESPONSE_TO_STARVATION -1.905033 -0.5187183 0.0000014 0.0002366 0.6435518 136 RPS14/RPL8/RPL7A/RPS25/RPL35A/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -2.074744 -0.5865145 0.0000000 0.0000091 0.7195128 108 RPS14/RPL8/RPL7A/RPS25/RPL35A/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -2.090081 -0.5983459 0.0000000 0.0000091 0.7195128 95 RPS14/RPL7A/RPS25/RPL35A/RPS17/…
REACTOME_REGULATION_OF_EXPRESSION_OF_SLITS_AND_ROBOS -2.041356 -0.5502725 0.0000000 0.0000029 0.7477397 149 RPS14/RPL8/RPL7A/RPS25/PSMD5/…
REACTOME_SELENOAMINO_ACID_METABOLISM -2.128612 -0.6085994 0.0000000 0.0000029 0.7477397 98 RPS14/RPL8/RPL7A/RPS25/RPL35A/…
REACTOME_NONSENSE_MEDIATED_DECAY_NMD -2.149436 -0.6056000 0.0000000 0.0000021 0.7749390 111 RPS14/RPL8/RPL7A/SMG8/RPS25/…
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -2.211749 -0.6367378 0.0000000 0.0000021 0.7614608 87 RPS14/RPL7A/RPS25/RPL35A/RPS17/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -2.184671 -0.6246276 0.0000000 0.0000015 0.7881868 98 RPS14/RPL8/RPL7A/RPS25/RPL35A/…

11.2.3.5.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 2.127200 0.6453466 0.0000285 0.0148474 0.5756103 48 IGLV3-21/IGKV2-30/IGHV1-2/IGKV3-20/IGHV3-48/…
REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 2.078025 0.6367109 0.0000274 0.0148474 0.5756103 45 IGLV3-21/IGKV2-30/IGHV1-2/IGKV3-20/IGHV3-48/…
REACTOME_COMPLEMENT_CASCADE 2.066804 0.5998780 0.0000640 0.0200087 0.5384341 57 IGLV3-21/IGKV2-30/CLU/IGHV1-2/IGKV3-20/…
REACTOME_CD22_MEDIATED_BCR_REGULATION 2.024266 0.6202391 0.0000586 0.0200087 0.5573322 45 IGLV3-21/IGKV2-30/IGHV1-2/IGKV3-20/IGHV3-48/…
REACTOME_HDMS_DEMETHYLATE_HISTONES 2.067104 0.6666766 0.0001782 0.0422035 0.5188481 36 H3C2/H3C10/ARID5B/H3C7/KDM5D/…
REACTOME_FORMATION_OF_SENESCENCE_ASSOCIATED_HETEROCHROMATIN_FOCI_SAHF 2.044491 0.8274335 0.0002370 0.0422035 0.5188481 13 H1-4/H1-2/H1-3/ASF1A/H1-5
REACTOME_INTERLEUKIN_7_SIGNALING 2.005942 0.7153777 0.0002543 0.0422035 0.4984931 21 H3C2/H3C10/H3C7/H3C3/H3C13/…
REACTOME_ROLE_OF_LAT2_NTAL_LAB_ON_CALCIUM_MOBILIZATION 1.955232 0.5872321 0.0003124 0.0422035 0.4984931 50 IGLV3-21/IGKV2-30/IGHV1-2/IGKV3-20/IGHV3-48/…
REACTOME_FCGR_ACTIVATION 1.950651 0.5858563 0.0003238 0.0422035 0.4984931 50 IGLV3-21/IGKV2-30/IGHV1-2/IGKV3-20/IGHV3-48/…
REACTOME_PEPTIDE_LIGAND_BINDING_RECEPTORS 1.960903 0.6488797 0.0008084 0.0602090 0.4772708 30 CCR7/CXCR4/CCL3L3/POMC/CCR5/…
REACTOME_RAB_GERANYLGERANYLATION -1.732910 -0.6004349 0.0020598 0.0826044 0.4317077 33 RAB4B/RAB7A/RAB2B/RAB8A/RAB39B/…
REACTOME_METABOLISM_OF_ANGIOTENSINOGEN_TO_ANGIOTENSINS -1.734302 -0.8540047 0.0017201 0.0747270 0.4550599 7 ATP6AP2/CTSZ/AOPEP/ACE/GZMH
REACTOME_METABOLISM_OF_AMINO_ACIDS_AND_DERIVATIVES -1.487242 -0.3801159 0.0014347 0.0723806 0.4550599 245 OAZ1/MTR/MTRR/SARDH/SEM1/…
REACTOME_REGULATION_OF_EXPRESSION_OF_SLITS_AND_ROBOS -1.586072 -0.4296856 0.0010839 0.0627842 0.4550599 147 NCBP2/SEM1/RPL10/RPL7A/RPL14/…
REACTOME_SIGNALING_BY_ROBO_RECEPTORS -1.586987 -0.4205260 0.0008839 0.0627842 0.4772708 173 NCBP2/VASP/SEM1/RPL10/RPL7A/…
REACTOME_NUCLEOTIDE_EXCISION_REPAIR -1.674600 -0.4730561 0.0010321 0.0627842 0.4550599 104 USP7/RFC2/RAD23B/RFC3/GTF2H2/…
REACTOME_ASSOCIATION_OF_TRIC_CCT_WITH_TARGET_PROTEINS_DURING_BIOSYNTHESIS -1.826008 -0.6527378 0.0010579 0.0627842 0.4550599 28 LONP2/CCT2/CCT4/DCAF7/CCNE2/…
REACTOME_SIGNALING_BY_MET -1.834942 -0.6195994 0.0007180 0.0563916 0.4772708 37 RAB4B/PTPN2/CBL/RANBP9/NRAS/…
REACTOME_TRANSLATION -1.498196 -0.3771111 0.0005766 0.0548477 0.4772708 277 MRPL35/MRPL38/APEH/SSR1/MRPS16/…
REACTOME_NERVOUS_SYSTEM_DEVELOPMENT -1.556695 -0.3854789 0.0002612 0.0422035 0.4984931 320 DLG1/NCBP2/VASP/APH1B/DLG3/…

11.2.3.5.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
REACTOME_COMPLEMENT_CASCADE 2.328440 0.7095176 0.0000001 0.0001199 0.7049757 61 C1QC/C1QB/CLU/C1QA/IGHV1-2/…
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 2.334721 0.7274181 0.0000004 0.0003283 0.6749629 51 C1QC/C1QB/C1QA/IGHV1-2/IGLV3-21/…
REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 2.300390 0.7276567 0.0000010 0.0004966 0.6594444 47 C1QC/C1QB/C1QA/IGHV1-2/IGLV3-21/…
REACTOME_INTERFERON_SIGNALING 1.912553 0.5074714 0.0000025 0.0009754 0.6272567 158 FCGR1A/IFI27/IFITM2/HERC5/MT2A/…
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING 2.138732 0.6781072 0.0000342 0.0089123 0.5573322 46 IFITM2/IFITM3/IFITM1/RSAD2/IFI6/…
REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL 1.946263 0.5333351 0.0000416 0.0092853 0.5573322 122 FCGR1A/IFITM1/IGHV1-2/IGLV3-21/IGLV3-1/…
REACTOME_BINDING_AND_UPTAKE_OF_LIGANDS_BY_SCAVENGER_RECEPTORS 1.933008 0.5894729 0.0001498 0.0208972 0.5188481 59 MARCO/IGHV1-2/IGLV3-21/IGLV3-1/HP/…
REACTOME_FCGR_ACTIVATION 1.975826 0.6105527 0.0002215 0.0266277 0.5188481 52 FCGR1A/IGHV1-2/IGLV3-21/IGLV3-1/IGLV3-19/…
REACTOME_FORMATION_OF_FIBRIN_CLOT_CLOTTING_CASCADE 1.853646 0.8910449 0.0004117 0.0459580 0.4984931 8 SERPING1/PF4/F5/F13A1
REACTOME_ANTI_INFLAMMATORY_RESPONSE_FAVOURING_LEISHMANIA_PARASITE_INFECTION 1.714911 0.4837702 0.0005785 0.0602809 0.4772708 104 FCGR1A/ADM/IGHV1-2/IGLV3-21/IGLV3-1/…
REACTOME_M_PHASE -1.438456 -0.3445277 0.0008100 0.0744759 0.4772708 238 AAAS/PPP2R2D/SDCCAG8/AHCTF1/HAUS1/…
REACTOME_GLOBAL_GENOME_NUCLEOTIDE_EXCISION_REPAIR_GG_NER -1.817091 -0.5111542 0.0001604 0.0208972 0.5188481 74 POLD2/XPC/PCNA/POLD3/UBE2N/…
REACTOME_RHOBTB_GTPASE_CYCLE -2.002547 -0.6679343 0.0000925 0.0160571 0.5384341 30 CCT2/VIM/HNRNPC/STK38/COPS2/…
REACTOME_RHOBTB1_GTPASE_CYCLE -2.056884 -0.7621998 0.0000516 0.0100903 0.5573322 19 CCT2/VIM/HNRNPC/STK38/COPS2/…

11.2.3.5.3.5 cluster_5
pathway NES ES pval padj log2err size leadingEdge
REACTOME_INTERFERON_SIGNALING 2.208966 0.5825714 0.0e+00 0.0000496 0.7195128 142 IFI27/IFIT1/IFITM1/IFI6/ISG15/…
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING 2.520403 0.7853826 1.7e-06 0.0013139 0.6435518 42 IFIT1/IFITM1/IFI6/ISG15/MX1/…

11.2.3.5.3.6 cluster_6
pathway NES ES pval padj log2err size leadingEdge
REACTOME_INTERFERON_SIGNALING 2.321979 0.6088425 0.0000000 0.0000172 0.7477397 138 IFI27/IFI6/IFITM1/IFITM3/XAF1/…
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING 2.543723 0.7886879 0.0000001 0.0000974 0.6901325 43 IFI6/IFITM1/IFITM3/XAF1/MX1/…
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 2.207567 0.8119057 0.0000623 0.0316444 0.5384341 19 IGLV1-40/C1QB/IGLV3-21/IGKV3-20/FCN1/…
REACTOME_COMPLEMENT_CASCADE 2.266258 0.7768909 0.0001013 0.0385509 0.5384341 24 IGLV1-40/C1QB/IGLV3-21/IGKV3-20/FCN1/…
REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM 1.593366 0.3619088 0.0001716 0.0522715 0.5188481 386 IFI27/IFI6/IFITM1/IFITM3/XAF1/…

11.2.3.6 CP_KEGG

11.2.3.6.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.6.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.6.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy; a negative NES indicates enrichment in Healthy relative to Covid_Mild.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.6.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
KEGG_GRAFT_VERSUS_HOST_DISEASE 2.014249 0.6957889 0.0003162 0.0292455 0.4984931 32 FASLG/KIR3DL1/HLA-DRB5/HLA-DOB/HLA-DRB1/…
KEGG_RIBOSOME -2.052827 -0.5831366 0.0000021 0.0003909 0.6272567 84 RPL13/RPL37A/RPS27/RPL10A/RPL4/…

11.2.3.6.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
KEGG_ALZHEIMERS_DISEASE -1.559612 -0.4242751 0.0031687 0.0982283 0.4317077 124 COX6C/COX7B/NDUFA4/TNF/UQCR11/…
KEGG_OXIDATIVE_PHOSPHORYLATION -1.603407 -0.4425057 0.0023696 0.0982283 0.4317077 112 COX6C/COX7B/NDUFA4/UQCR11/COX7C/…
KEGG_ASTHMA -1.722090 -0.6940613 0.0027546 0.0982283 0.4317077 16 TNF/HLA-DPB1/HLA-DRA
KEGG_PARKINSONS_DISEASE -1.735147 -0.4814700 0.0002654 0.0164542 0.4984931 106 COX6C/COX7B/NDUFA4/UQCR11/COX7C/…
KEGG_CARDIAC_MUSCLE_CONTRACTION -1.992528 -0.7030220 0.0000481 0.0044722 0.5573322 30 COX6C/COX7B/UQCR11/COX7C/ATP1B3/…
KEGG_RIBOSOME -2.254719 -0.6477164 0.0000000 0.0000007 0.7614608 84 RPL8/RPL7A/RPS25/RPL35A/RPS17/…

11.2.3.6.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 1.894826 0.534839 0.0003954 0.0731415 0.4984931 71 H3C2/H3C10/H3C7/H3C3/HLA-DRB5/…

11.2.3.6.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 2.272672 0.6778046 3.00e-07 0.0000508 0.6749629 65 C1QC/C1QB/FCGR1A/C1QA/CD40/…
KEGG_COMPLEMENT_AND_COAGULATION_CASCADES 2.074830 0.8554510 6.37e-05 0.0059279 0.5384341 15 C1QC/C1QA/SERPING1/CR1/F5/…
KEGG_PRION_DISEASES 2.036097 0.7755857 1.67e-04 0.0103536 0.5188481 21 C1QC/C1QB/C1QA/IL1B/CCL5/…

11.2.3.6.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
KEGG_HUNTINGTONS_DISEASE -1.683514 -0.4568229 0.0005711 0.0303615 0.4772708 125 SDHB/NDUFS4/APAF1/NDUFB9/NDUFS7/…
KEGG_OXIDATIVE_PHOSPHORYLATION -1.691128 -0.4645116 0.0006600 0.0303615 0.4772708 106 SDHB/NDUFS4/NDUFB9/NDUFS7/NDUFB7/…
KEGG_PARKINSONS_DISEASE -1.725441 -0.4776757 0.0005921 0.0303615 0.4772708 102 SDHB/NDUFS4/APAF1/UBE2J2/NDUFB9/…
KEGG_LYSOSOME -1.859359 -0.5351416 0.0000448 0.0082472 0.5573322 83 LAPTM4A/LAPTM5/AP3B1/CD63/GNS/…

11.2.3.7 TFT_GTRD

11.2.3.7.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.7.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.7.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy; a negative NES indicates enrichment in Healthy relative to Covid_Mild.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.7.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
SMCHD1_TARGET_GENES 1.922524 0.6866805 0.0008621 0.0296569 0.4772708 25 NBPF1/TRGV5P/LERFS/TRAV10/MT-ND6/…
HOXC11_TARGET_GENES 2.152064 0.7293295 0.0010486 0.0338166 0.4550599 32 S100B/DUSP6
ZNF101_TARGET_GENES -1.436313 -0.3600726 0.0062072 0.0889699 0.4070179 190 NEMF/CNEP1R1/FBXO8/SNRPD1/ENTPD1-AS1/…
PCGF1_TARGET_GENES -1.380740 -0.3270025 0.0045587 0.0823864 0.4070179 337 RCOR3/ZNF280D/DCTN1/PRKAG2/GK5/…
ZBED4_TARGET_GENES -1.486954 -0.3917365 0.0033962 0.0700976 0.4317077 139 MFSD11/SEC61G/VPS26C/FAM53C/ETS2/…
FOXE1_TARGET_GENES -1.386887 -0.3170301 0.0005339 0.0229570 0.4772708 508 MFSD11/RPL13/BABAM1/ATF5/IER2/…
DLX6_TARGET_GENES -1.482054 -0.3440281 0.0001914 0.0123457 0.5188481 429 GMPR2/NEMF/CSNK2B/COX7A2L/VASP/…
PHF2_TARGET_GENES -1.370358 -0.2998527 0.0000684 0.0058824 0.5384341 1016 RPL13/RCOR3/POP7/GALNS/CSNK2B/…
RFX7_TARGET_GENES -1.554477 -0.3653217 0.0000481 0.0049651 0.5573322 379 GMPR2/POP7/C12orf4/ATF5/NEMF/…
NKX2_5_TARGET_GENES -1.425889 -0.3148294 0.0000334 0.0043071 0.5573322 830 MFSD11/METTL3/COX7A2L/PALLD/FBXO8/…
ELF2_TARGET_GENES -1.381361 -0.2999203 0.0000063 0.0016220 0.6105269 1206 BABAM1/C12orf4/IER2/FBXO8/CHFR/…
NFE2L1_TARGET_GENES -1.428544 -0.3084065 0.0000002 0.0000780 0.6901325 1398 GMPR2/MFSD11/RMC1/BABAM1/BTG2/…

11.2.3.7.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
ZNF236_TARGET_GENES -1.479432 -0.3940172 0.0044454 0.0997317 0.4070179 156 MEF2A/COX6C/SLX4IP/TSPAN31/MALAT1/…
MCM3_TARGET_GENES -1.546546 -0.4270079 0.0032270 0.0756886 0.4317077 115 FTSJ3/SLX4IP/ZNF432/MRPL52/DEDD/…
ZNF197_TARGET_GENES -1.395317 -0.3383996 0.0030195 0.0741930 0.4317077 376 BZW2/DHX40/SLX4IP/NDUFA4/SIPA1L3/…
TERF2_TARGET_GENES -1.589418 -0.4595987 0.0024678 0.0636705 0.4317077 87 TMBIM1/ATP5MJ/C1orf74/NF1/TBL1XR1/…
CIITA_TARGET_GENES -1.348953 -0.3065996 0.0003165 0.0183378 0.4984931 970 N4BP1/FTSJ3/SLX4IP/ZNF432/BTN2A2/…
FXR1_TARGET_GENES -1.375623 -0.3175854 0.0003198 0.0183378 0.4984931 716 COX16/CNOT8/FTSJ3/NDUFA4/SIPA1L3/…
BANP_TARGET_GENES -1.432976 -0.3341123 0.0002902 0.0183378 0.4984931 573 COX7B/PPM1G/SLX4IP/SPTY2D1/OFD1/…
RUVBL1_TARGET_GENES -1.730227 -0.4706364 0.0000899 0.0115936 0.5384341 132 RPS14/FTSJ3/SLX4IP/CCDC47/FMC1/…
NFRKB_TARGET_GENES -1.362722 -0.3038247 0.0000094 0.0024168 0.5933255 1489 MEF2A/COX16/FTSJ3/COX7B/PRMT1/…
DLX4_TARGET_GENES -1.523407 -0.3536348 0.0000055 0.0024168 0.6105269 648 COX16/FTSJ3/PAIP2B/NDUFA4/CSPP1/…

11.2.3.7.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
PSMB5_TARGET_GENES 1.448105 0.3486876 0.0042410 0.0989136 0.4070179 214 H1-4/H1-2/H3C10/PMAIP1/DDIT4/…
ZNF213_TARGET_GENES -1.388793 -0.3375391 0.0041518 0.0989136 0.4070179 384 DNAJB12/KXD1/GLYR1/GLOD4/RFX1/…
IRF5_TARGET_GENES -1.368287 -0.3250624 0.0037916 0.0978228 0.4317077 538 RAB4B/COMMD1/RABL2B/CREBZF/RFC3/…
NPM1_TARGET_GENES -1.412006 -0.3529889 0.0036984 0.0978228 0.4317077 275 SEPTIN2/CSPP1/IREB2/DUSP12/COA1/…
PITX1_TARGET_GENES -1.459145 -0.3664280 0.0026417 0.0801821 0.4317077 258 RAB4B/GUSB/DNAJB12/COMMD1/ATG9A/…
GLI3_TARGET_GENES -1.375675 -0.3260530 0.0021700 0.0699816 0.4317077 546 RAB4B/DLG1/EDC3/MUTYH/SEPTIN2/…
NFE2L3_TARGET_GENES -1.720815 -0.9286220 0.0010710 0.0425119 0.4550599 5 TEP1
ARID5B_TARGET_GENES -1.380678 -0.3215926 0.0007853 0.0377204 0.4772708 690 GUSB/CRBN/XRCC5/ZNF37A/PLD3/…
GUCY1B1_TARGET_GENES -1.422982 -0.3383012 0.0006488 0.0371979 0.4772708 509 GUSB/PLD3/DRAM2/WASHC5/ADCK1/…
ZNF2_TARGET_GENES -1.403596 -0.3256464 0.0002489 0.0256816 0.4984931 757 DNAJB12/CRBN/NCBP2/TBL3/CCT2/…
TAF9B_TARGET_GENES -1.529550 -0.3690157 0.0000568 0.0146545 0.5573322 425 MRPL35/COMMD1/SEPTIN2/IMMP1L/WDR74/…

11.2.3.7.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
PSMB5_TARGET_GENES 1.655767 0.4197568 0.0003547 0.0382911 0.4984931 219 ID1/ID2/SNHG5/ADM/HSPA1B/…
ZNF592_TARGET_GENES -1.215365 -0.2552286 0.0014816 0.0707716 0.4550599 1253 AAAS/PPP2R2D/PDIA5/CALCOCO1/GOSR2/…
BARX2_TARGET_GENES -1.232782 -0.2586457 0.0009855 0.0707716 0.4550599 1292 TRIM26/WBP11/CCT2/RAB11B-AS1/RAD51B/…
SUPT20H_TARGET_GENES -1.254619 -0.2653314 0.0008977 0.0707716 0.4772708 1025 ILF3-DT/TMEM106B/AAAS/WBP11/RAB11B-AS1/…
E2F5_TARGET_GENES -1.261834 -0.2719733 0.0016490 0.0707716 0.4550599 782 C19orf53/PTP4A3/STAT5B/C11orf21/RAD51B/…
INSM2_TARGET_GENES -1.348438 -0.2996003 0.0013726 0.0707716 0.4550599 500 ILF3-DT/CCT2/RAB11B-AS1/WARS1/ARPP19/…
NPM1_TARGET_GENES -1.408016 -0.3307216 0.0016186 0.0707716 0.4550599 281 ABCD4/COPA/YBX1/THOC1/NFATC3/…
E2F2_TARGET_GENES -1.267328 -0.2679354 0.0003718 0.0382911 0.4984931 1040 C19orf53/ILF3-DT/FRAT1/AIFM1/ITSN1/…
ZNF830_TARGET_GENES -1.641045 -0.4030440 0.0001248 0.0214185 0.5188481 196 ABCD4/EIF6/THOC1/NDUFB7/ZSCAN30/…
ZNF407_TARGET_GENES -1.279812 -0.2668954 0.0000466 0.0163953 0.5573322 1396 ILF3-DT/TMEM106B/AAAS/SYVN1/ITSN1/…
ELF2_TARGET_GENES -1.305142 -0.2747181 0.0000637 0.0163953 0.5384341 1167 ZUP1/RAB11B-AS1/MFN1/DPH3/GOSR2/…

11.2.3.7.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
NFE2L1_TARGET_GENES -1.415033 -0.3228922 8.2e-06 0.0041929 0.5933255 1023 KIDINS220/CHCHD3/CAP1/TRMT112/TNPO3/…

11.2.3.8 TFT_TFT_Legacy

11.2.3.8.1 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.8.2 Summaryplot: Healthy

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Healthy. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.2.3.8.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Mild and Healthy for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Mild relative to Healthy; a negative NES indicates enrichment in Healthy relative to Covid_Mild.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.2.3.8.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
SCGGAAGY_ELK1_02 -1.332276 -0.2913055 0.0001058 0.0645449 0.5384341 1074 GMPR2/BABAM1/CSNK2B/YTHDF2/CCP110/…

11.2.3.8.3.2 cluster_3
pathway NES ES pval padj log2err size leadingEdge
KRCTCNNNNMANAGC_UNKNOWN 2.460644 0.7779709 0.00e+00 0.0000280 0.7195128 37 H1-4/H1-2/H3C2/H3C10/H3C7/…
TTTNNANAGCYR_UNKNOWN 2.032007 0.5781782 3.45e-05 0.0105254 0.5573322 65 H1-4/H1-2/H3C2/H3C10/H3C7/…
SCGGAAGY_ELK1_02 -1.332961 -0.3048625 3.16e-04 0.0642453 0.4984931 1042 RAB4B/PTPN2/KXD1/PPP1R35/UFD1/…

11.2.3.8.3.3 cluster_4
pathway NES ES pval padj log2err size leadingEdge
FREAC4_01 1.856993 0.5632482 0.0007946 0.0973142 0.4772708 62 ID2/CDKN1A/CAMK1D/CITED2/CTCF/…
GATA_Q6 1.854005 0.5680466 0.0005867 0.0973142 0.4772708 60 CLU/ADM/LCAT/CACNA2D3/STON2/…
E12_Q6 1.794529 0.4997787 0.0004455 0.0973142 0.4984931 105 C1QC/CAMK1D/FAM53C/MIR22HG/ACAP1/…
E47_01 1.710654 0.4642730 0.0007977 0.0973142 0.4772708 123 C1QC/CAMK1D/CCNYL1/EPHB2/PPTC7/…
MYOD_01 1.677834 0.4629032 0.0009672 0.0983319 0.4772708 114 C1QC/FAM53C/MIR22HG/TICAM1/ACAP1/…
CCCNNNNNNAAGWT_UNKNOWN -1.754053 -0.5005989 0.0002481 0.0973142 0.4984931 71 IRF9/NCDN/CPEB4/DHRS9/MRPL49/…

11.3 Covid_Critical_vs_Covid_Mild

11.3.1 Number of significant DEGs across conditions in each cluster

Here we check the number of differentially expressed genes (DEGs) after applying some statistical thresholds:

  • Adjusted P value < 0.1

  • Absolute Log Fold Change (LFC) > 0

  • Proportion of cells expressing gene in condition A > 0.1 if LFC is positive

  • Proportion of cells expressing gene in condition B > 0.1 if LFC is negative

Differential expression compares genes in condition A (left) versus condition B (right). The left column indicates the number of genes upregulated in the left condition, while the right column indicates genes upregulated in the right condition.

Finally, the thresholds above do not affect downstream results from pathway analysis; they are only used to count the number of DEGs. Downstream usage of DEGs can use these thresholds, or you can choose other appropriate cutoffs.

Cluster Covid_Critical_high Covid_Mild_high
cluster_1 193 66
cluster_2 4 1
cluster_3 9 5
cluster_4 60 51
cluster_5 0 0
cluster_6 6 5
cluster_7 29 11

11.3.2 Per-cluster heatmaps of all DEGs

To ensure the differential expression results are robust, it is helpful to inspect all DEGs by visualizing them in a heatmap. For each cluster, we compare the DEGs at both the single-cell level and either the pseudobulk level (if we used pseudobulk_edgeR) or the average RISC value level (if using wilcox).

11.3.2.1 Cluster_1

11.3.2.1.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.1.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.2 Cluster_2

11.3.2.2.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.2.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.3 Cluster_3

11.3.2.3.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.3.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.4 Cluster_4

11.3.2.4.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.4.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.5 Cluster_6

11.3.2.5.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.5.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.6 Cluster_7

11.3.2.6.1 Differentially Expressed Gene Heatmap for all cells in this cluster

Here, we plot all significant DEGs for all cells in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.2.6.2 Differentially Expressed Gene Heatmap plotted at pseudobulk level for replicates for this cluster

Here, we plot all significant DEGs for pseudobulked replicates in this cluster from condition A vs condition B. Condition A is denoted with positive fold change.

11.3.3 Gene set enrichment analysis across conditions

Pathway analysis for the cross-condition analysis is performed for overexpressed and underexpressed genes for each cluster. This is done via Gene Set Enrichment Analysis (GSEA) (Subramanian et al., 2005).

GSEA is preferred over other pathway analysis methods, such as Fisher's exact test or chi-square tests, because it does not require making arbitrary cutoffs to the number of DEGs and takes into account how strongly differentially expressed each gene may be. For the latter, a data-driven gene-specific weight is applied. We use a standard weighting method of -log10(P-value) * sign of Log Fold Change.

The pathways we choose in pathway analysis are derived from the Molecular Signatures Database (MSIGDB) where they are sorted by categories, such as Gene Ontology (GO) Biological Process, GO Molecular Function, KEGG, Reactome, etc. These are databases that annotate genes by function or molecular pathway.

11.3.3.1 HALLMARK

11.3.3.1.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.1.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.1.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates enrichment in Covid_Mild relative to Covid_Critical.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.3.3.1.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_E2F_TARGETS 2.737946 0.7895349 0.0000000 0.0000000 1.4885397 192 CENPM/MCM4/TK1/RRM2/MKI67/…
HALLMARK_G2M_CHECKPOINT 2.544265 0.7445422 0.0000000 0.0000000 1.2210538 172 GINS2/MKI67/MCM2/STMN1/CKS2/…
HALLMARK_MYC_TARGETS_V1 2.075492 0.5995023 0.0000000 0.0000000 0.7749390 190 TYMS/MCM4/MCM2/MCM7/DUT/…
HALLMARK_MTORC1_SIGNALING 1.817999 0.5273447 0.0000042 0.0000530 0.6105269 180 MCM4/RRM2/MCM2/DHFR/SHMT2/…
HALLMARK_MITOTIC_SPINDLE 1.705572 0.5017886 0.0001141 0.0011410 0.5384341 164 LMNB1/BIRC5/TOP2A/SMC4/YWHAE/…
HALLMARK_APOPTOSIS 1.747477 0.5305002 0.0001396 0.0011632 0.5188481 126 HMGB2/TOP2A/BAX/GSTM1/CD38/…
HALLMARK_ESTROGEN_RESPONSE_LATE 1.810981 0.5915113 0.0004493 0.0032092 0.4984931 75 PRSS23/FABP5/RBBP8/FOS/HPRT1/…
HALLMARK_SPERMATOGENESIS 1.786145 0.6065984 0.0010041 0.0062753 0.4550599 55 CDKN3/NCAPH/TKTL1/KIF2C/CCNB2/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.689822 0.5688202 0.0018845 0.0104697 0.4550599 60 LGALS1/JUN/MAGEE1/GLIPR1/AREG/…
HALLMARK_MYC_TARGETS_V2 1.724708 0.5889784 0.0045457 0.0227287 0.4070179 52 MCM4/PLK1/MCM5/PLK4/UNG/…

11.3.3.1.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.730964 0.4704992 0.0001438 0.0071898 0.5188481 130 RHOB/JUN/FOS/TNIP2/IRF1/…
HALLMARK_INTERFERON_ALPHA_RESPONSE 1.579154 0.4599315 0.0033237 0.0830925 0.4317077 85 IFITM3/IRF1/WARS1/IFI30/IFIT3/…
HALLMARK_CHOLESTEROL_HOMEOSTASIS 1.597827 0.4965563 0.0083353 0.0912447 0.3807304 57 FABP5/ATF3/PMVK/FDPS/SCD/…
HALLMARK_MYOGENESIS 1.572948 0.4668065 0.0070259 0.0912447 0.4070179 74 CFD/PTP4A3/PLXNB2/FXYD1/ACSL1/…
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 1.547222 0.4865862 0.0122976 0.0912447 0.3807304 51 RHOB/EMP3/JUN/LGALS1/P3H1/…
HALLMARK_IL2_STAT5_SIGNALING 1.479236 0.4056574 0.0111477 0.0912447 0.3807304 118 IFITM3/RHOB/COCH/SOCS1/KLF6/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.391599 0.3679859 0.0127743 0.0912447 0.3807304 164 IFITM3/IRF1/MTHFD2/WARS1/IFI30/…

11.3.3.1.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.908383 0.5229669 0.0000018 0.0000917 0.6435518 126 IFNGR2/FOS/SGK1/JUN/SOCS3/…
HALLMARK_IL2_STAT5_SIGNALING 1.764689 0.4796179 0.0000243 0.0006084 0.5756103 134 IFITM3/SOCS2/PIM1/CISH/PTGER2/…
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.687963 0.4452124 0.0001215 0.0020244 0.5384341 157 IFITM3/PIM1/IFIT1/SOCS3/TNFAIP3/…
HALLMARK_HYPOXIA 1.788345 0.5047328 0.0002017 0.0020807 0.5188481 97 PIM1/FOS/JUN/TNFAIP3/HK2/…
HALLMARK_ALLOGRAFT_REJECTION 1.659341 0.4509858 0.0002081 0.0020807 0.5188481 134 IFNGR2/EIF5A/CD40LG/GZMA/SOCS1/…
HALLMARK_INFLAMMATORY_RESPONSE 1.685743 0.4751696 0.0007592 0.0063266 0.4772708 101 IFNGR2/PTGER2/CCL5/SLC31A2/TNFRSF1B/…
HALLMARK_IL6_JAK_STAT3_SIGNALING 1.754310 0.5574797 0.0026213 0.0187235 0.4317077 49 PIM1/IFNGR2/JUN/SOCS3/SOCS1/…
HALLMARK_ESTROGEN_RESPONSE_LATE 1.656993 0.4950897 0.0033919 0.0188437 0.4317077 68 FOS/CISH/SGK1/AREG/SIAH2/…
HALLMARK_ESTROGEN_RESPONSE_EARLY 1.547493 0.4580171 0.0071503 0.0325014 0.4070179 71 FOS/UGCG/AREG/SIAH2/PPIF/…
HALLMARK_SPERMATOGENESIS 1.568080 0.5051500 0.0133072 0.0554466 0.3807304 47 PHF7/CCNB2/SPATA6/PCSK1N/GSTM3/…
HALLMARK_DNA_REPAIR -1.477229 -0.3700879 0.0057255 0.0286277 0.4070179 133 TSG101/GTF2A2/POLR2G/POLR3C/NFX1/…
HALLMARK_PROTEIN_SECRETION -1.608506 -0.4373577 0.0032887 0.0188437 0.4317077 86 TSG101/VAMP3/M6PR/ATP7A/GNAS/…

11.3.3.1.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
HALLMARK_INTERFERON_GAMMA_RESPONSE 1.918113 0.4947319 0.0000008 0.0000398 0.6594444 171 CXCL10/CD86/PIM1/CXCL9/HLA-DRB1/…
HALLMARK_IL2_STAT5_SIGNALING 1.989652 0.5280380 0.0000020 0.0000494 0.6272567 131 CXCL10/SLC39A8/CD86/PIM1/CCND3/…
HALLMARK_ALLOGRAFT_REJECTION 1.785129 0.4730931 0.0001526 0.0025437 0.5188481 134 FCGR2B/CD86/CCND3/CXCL9/HLA-DMB/…
HALLMARK_G2M_CHECKPOINT 1.693166 0.4408433 0.0002619 0.0032734 0.4984931 156 BIRC5/MKI67/PTTG1/TROAP/NOTCH2/…
HALLMARK_INFLAMMATORY_RESPONSE 1.641244 0.4393283 0.0011885 0.0084889 0.4550599 124 CXCL10/CD82/CXCL9/GNA15/PTGER2/…
HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.639965 0.4295291 0.0009050 0.0084889 0.4772708 149 CXCL10/TUBB2A/KLF10/DUSP2/ATP2B1/…
HALLMARK_E2F_TARGETS 1.593703 0.4101575 0.0010971 0.0084889 0.4550599 174 BIRC5/MKI67/PTTG1/CDKN3/HMGB3/…
HALLMARK_KRAS_SIGNALING_UP 1.671064 0.4623526 0.0026793 0.0167456 0.4317077 101 RETN/CXCL10/CD37/MMD/GYPC/…
HALLMARK_IL6_JAK_STAT3_SIGNALING 1.589935 0.4842787 0.0135668 0.0753711 0.3807304 61 CXCL10/ITGA4/PIM1/CXCL9/CD9/…

11.3.3.2 GO_BP

11.3.3.2.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.2.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.2.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates the pathway is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, it indicates that no pathways were significantly enriched in that cluster.

11.3.3.2.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOBP_PROTEIN_DNA_COMPLEX_ASSEMBLY 2.277259 0.6811043 0.0000000 0.0000000 0.9214260 148 H3C2/CENPN/MCM2/ASF1B/CDC45/…
GOBP_DNA_CONFORMATION_CHANGE 2.166534 0.6164765 0.0000000 0.0000000 0.9214260 237 H3C2/GINS2/MCM4/MCM2/ASF1B/…
GOBP_NUCLEOSOME_ASSEMBLY 2.314559 0.7487408 0.0000000 0.0000000 0.8634154 83 H3C2/MCM2/ASF1B/HMGB2/CENPW/…
GOBP_MEIOTIC_CELL_CYCLE 2.218469 0.6631053 0.0000000 0.0000000 0.8390889 142 CKS2/PKMYT1/CDC20/RAD51/NCAPH/…
GOBP_PROTEIN_DNA_COMPLEX_SUBUNIT_ORGANIZATION 2.132524 0.6188807 0.0000000 0.0000001 0.8140358 186 H3C2/CENPN/MCM2/ASF1B/CDC45/…
GOBP_DNA_PACKAGING 2.140101 0.6272248 0.0000000 0.0000002 0.8140358 173 H3C2/MCM2/ASF1B/HMGB2/NCAPH/…
GOBP_NUCLEOSOME_ORGANIZATION 2.184606 0.6615645 0.0000000 0.0000003 0.8012156 125 H3C2/MCM2/ASF1B/HMGB2/CENPW/…
GOBP_DNA_REPLICATION_INITIATION 2.275093 0.8575246 0.0000000 0.0000008 0.7749390 31 MCM4/MCM2/MCM7/CDC45/MCM10/…
GOBP_CELL_CYCLE_DNA_REPLICATION 2.221539 0.8091916 0.0000000 0.0000092 0.7195128 39 MCM4/MCM2/PCNA/CDC45/RAD51/…
GOBP_DNA_UNWINDING_INVOLVED_IN_DNA_REPLICATION 2.177817 0.8761540 0.0000002 0.0000477 0.6901325 22 GINS2/MCM4/MCM2/MCM7/CDC45/…
GOBP_REGULATION_OF_ACTIN_NUCLEATION -1.865035 -0.7056101 0.0009177 0.0549241 0.4772708 20 WAS/WASF2/AP1AR/ARFIP1/CYFIP1/…
GOBP_URATE_METABOLIC_PROCESS -1.584348 -0.9831093 0.0003745 0.0271984 0.4984931 3 PRPS1/ABCG2
GOBP_REGULATION_OF_ARP2_3_COMPLEX_MEDIATED_ACTIN_NUCLEATION -1.963158 -0.7789338 0.0003658 0.0268287 0.4984931 16 WAS/WASF2/AP1AR/ARFIP1/CYFIP1/…

11.3.3.2.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 2.133270 0.5206155 0.0000000 0.0000000 1.0476265 358 IGHV5-10-1/IGHV3-49/IGKV6-21/IGLV4-3/IGLV5-37/…
GOBP_REGULATION_OF_B_CELL_ACTIVATION 2.344807 0.6307548 0.0000000 0.0000000 0.9865463 150 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_COMPLEMENT_ACTIVATION 2.519309 0.7459768 0.0000000 0.0000000 0.9436322 70 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_HUMORAL_IMMUNE_RESPONSE_MEDIATED_BY_CIRCULATING_IMMUNOGLOBULIN 2.487493 0.7325738 0.0000000 0.0000000 0.9436322 74 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_POSITIVE_REGULATION_OF_B_CELL_ACTIVATION 2.354249 0.6602114 0.0000000 0.0000000 0.9101197 106 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_B_CELL_RECEPTOR_SIGNALING_PATHWAY 2.330385 0.6572601 0.0000000 0.0000000 0.8870750 103 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_MEMBRANE_INVAGINATION 2.333606 0.6574626 0.0000000 0.0000000 0.8753251 100 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_B_CELL_MEDIATED_IMMUNITY 2.200900 0.5948299 0.0000000 0.0000000 0.8753251 146 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_PHAGOCYTOSIS_RECOGNITION 2.397788 0.7209858 0.0000000 0.0000001 0.8390889 64 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_CELL_RECOGNITION 2.128607 0.5962921 0.0000000 0.0000022 0.7614608 108 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOBP_RIBONUCLEOPROTEIN_COMPLEX_BIOGENESIS -1.470557 -0.3363683 0.0006050 0.0916945 0.4772708 361 MRPL1/EBNA1BP2/SRPK2/USP39/METTL18/…
GOBP_POSITIVE_REGULATION_OF_ORGANELLE_ORGANIZATION -1.474498 -0.3452834 0.0005451 0.0843482 0.4772708 311 TNFSF10/CALCOCO2/WRAP73/SLC25A4/ANXA1/…
GOBP_ORGANONITROGEN_COMPOUND_BIOSYNTHETIC_PROCESS -1.282478 -0.2719349 0.0004705 0.0760287 0.4984931 1090 RPL30/SVIP/METTL18/PIGB/RB1CC1/…
GOBP_CYTOPLASMIC_TRANSLATION -1.859527 -0.4822311 0.0000284 0.0055584 0.5756103 136 RPL30/FMR1/RPS5/DHX36/RPL22/…
GOBP_CELLULAR_MACROMOLECULE_BIOSYNTHETIC_PROCESS -1.369944 -0.2937096 0.0000220 0.0045352 0.5756103 997 RPL30/NASP/SVIP/METTL18/PIGB/…

11.3.3.2.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 1.875645 0.4546636 0.0000000 0.0000006 0.8266573 403 TRBV18/TRAV27/HLA-DQB1/TRBV10-1/IGKV2-30/…
GOBP_RESPONSE_TO_BACTERIUM 1.763266 0.4334760 0.0000002 0.0003109 0.6901325 327 TRAV27/FOS/GNLY/TRAV25/IGHV3-49/…
GOBP_NEGATIVE_REGULATION_OF_VIRAL_PROCESS 1.929180 0.5758266 0.0000281 0.0161169 0.5756103 69 IFITM3/IFIT1/OAS2/TRIM11/CCL5/…
GOBP_RECEPTOR_SIGNALING_PATHWAY_VIA_STAT 1.875341 0.5427315 0.0000541 0.0268474 0.5573322 82 SOCS2/SOCS3/SOCS1/TNFRSF18/CCL5/…
GOBP_RESPONSE_TO_MOLECULE_OF_BACTERIAL_ORIGIN 1.692554 0.4483018 0.0000993 0.0389238 0.5384341 164 FOS/PTGER2/TNFAIP3/ARID5A/CCL5/…
GOBP_PLASMA_CELL_DIFFERENTIATION 1.733595 0.9494068 0.0002125 0.0626707 0.5188481 5 LGALS1/XBP1/IL10
GOBP_NEGATIVE_REGULATION_OF_VIRAL_GENOME_REPLICATION 1.931641 0.6115060 0.0002622 0.0650605 0.4984931 45 IFITM3/IFIT1/OAS2/CCL5/MX1/…
GOBP_RESPONSE_TO_ALCOHOL 1.753322 0.4782822 0.0002908 0.0658004 0.4984931 123 CLDN5/FOS/SGK1/CDO1/PTGER2/…
GOBP_TYROSINE_PHOSPHORYLATION_OF_STAT_PROTEIN 1.866332 0.6225493 0.0004476 0.0841860 0.4984931 36 SOCS3/SOCS1/TNFRSF18/CCL5/IL6ST/…
GOBP_RESPONSE_TO_MINERALOCORTICOID 1.884710 0.7786223 0.0006691 0.0976717 0.4772708 14 FOS/SGK1/CYBA
GOBP_TRIPEPTIDE_TRANSMEMBRANE_TRANSPORT -1.639683 -0.9829291 0.0005507 0.0911007 0.4772708 3 ABCC1
GOBP_TRIPEPTIDE_TRANSPORT -1.639683 -0.9829291 0.0005507 0.0911007 0.4772708 3 ABCC1
GOBP_NCRNA_3_END_PROCESSING -1.900619 -0.6150942 0.0004283 0.0839063 0.4984931 36 DKC1/INTS2/TRMT10C/EXOSC10/INTS14/…
GOBP_LYSOSOMAL_TRANSPORT -1.813481 -0.4889308 0.0002202 0.0626707 0.5188481 93 TSG101/RAB7A/M6PR/BIN1/CLEC16A/…
GOBP_VACUOLAR_TRANSPORT -1.767638 -0.4497428 0.0000689 0.0302966 0.5384341 124 TSG101/RAB7A/M6PR/BIN1/CLEC16A/…

11.3.3.2.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOBP_HUMORAL_IMMUNE_RESPONSE 2.165119 0.5878952 0.0000000 0.0000137 0.7881868 127 CXCL10/FCGR2B/C1QA/C1QB/CXCL9/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_OR_POLYSACCHARIDE_ANTIGEN_VIA_MHC_CLASS_II 2.221142 0.7768539 0.0000006 0.0006202 0.6594444 27 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/CTSD/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN_VIA_MHC_CLASS_II 2.237628 0.8115468 0.0000007 0.0006842 0.6594444 23 FCGR2B/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/…
GOBP_ANTIMICROBIAL_HUMORAL_IMMUNE_RESPONSE_MEDIATED_BY_ANTIMICROBIAL_PEPTIDE 2.180670 0.7663601 0.0000024 0.0011792 0.6272567 26 CXCL10/CXCL9/S100A12/RNASE3/HMGN2/…
GOBP_MICROGLIAL_CELL_ACTIVATION 2.166229 0.7468120 0.0000042 0.0018359 0.6105269 30 CX3CR1/CST7/C1QA/LRRK2/FPR2/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_ANTIGEN 2.132286 0.6986919 0.0000044 0.0018359 0.6105269 39 FCGR2B/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/…
GOBP_ANTIMICROBIAL_HUMORAL_RESPONSE 2.073288 0.6587770 0.0000061 0.0022711 0.6105269 47 CXCL10/CXCL9/S100A9/S100A12/RNASE3/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN 2.058996 0.6466273 0.0000100 0.0032469 0.5933255 50 FCGR2B/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_EXOGENOUS_PEPTIDE_ANTIGEN 2.129063 0.7265962 0.0000121 0.0037956 0.5933255 32 FCGR2B/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/…
GOBP_DEFENSE_RESPONSE_TO_FUNGUS 2.085321 0.7405266 0.0000393 0.0086822 0.5573322 25 CX3CR1/S100A8/S100A9/S100A12/USP15/…
GOBP_TELOMERE_MAINTENANCE_VIA_RECOMBINATION -1.856567 -0.7685224 0.0013664 0.0737555 0.4550599 13 BRCA2/SMC6/SMC5/RAD50/ERCC4
GOBP_SPHINGOMYELIN_CATABOLIC_PROCESS -1.711820 -0.9186817 0.0010558 0.0646364 0.4550599 5 PRKCD/SMPDL3A
GOBP_NEGATIVE_REGULATION_OF_PROTEIN_LOCALIZATION_TO_NUCLEUS -1.883167 -0.7081220 0.0007839 0.0539573 0.4772708 20 LZTS2/ILRUN/NF1/FBXO4/LATS2/…
GOBP_RNA_PHOSPHODIESTER_BOND_HYDROLYSIS_ENDONUCLEOLYTIC -1.794260 -0.5300054 0.0005956 0.0446348 0.4772708 61 POP4/DBR1/POP5/RPP30/LACTB2/…
GOBP_ORGANELLE_TRANSPORT_ALONG_MICROTUBULE -1.908485 -0.5737035 0.0001572 0.0206867 0.5188481 54 FBXW11/BLOC1S2/KIF5B/BLOC1S1/RHOT1/…
GOBP_VESICLE_TRANSPORT_ALONG_MICROTUBULE -2.019859 -0.6825127 0.0001058 0.0158803 0.5384341 31 FBXW11/BLOC1S2/KIF5B/BLOC1S1/BLOC1S3/…

11.3.3.2.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOBP_GLIAL_CELL_DEVELOPMENT 2.456649 0.7731897 0.0000002 0.0013220 0.6901325 27 S100A8/S100A9/CLU/PHGDH/ILK/…
GOBP_DEFENSE_RESPONSE_TO_FUNGUS 2.395708 0.8599855 0.0000007 0.0024346 0.6594444 16 S100A8/S100A9/S100A12/SPON2
GOBP_PEPTIDE_ANTIGEN_ASSEMBLY_WITH_MHC_CLASS_II_PROTEIN_COMPLEX 2.305183 0.8578644 0.0000032 0.0075097 0.6272567 14 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
GOBP_PROTEIN_NITROSYLATION 2.080690 0.9497010 0.0000061 0.0105872 0.6105269 7 S100A8/S100A9/ATP2B4
GOBP_POSITIVE_REGULATION_OF_NF_KAPPAB_TRANSCRIPTION_FACTOR_ACTIVITY 2.054708 0.5639239 0.0000115 0.0160569 0.5933255 53 S100A8/S100A9/S100A12/CLU/TRIM14
GOBP_GLIAL_CELL_DIFFERENTIATION 2.126475 0.5973731 0.0000190 0.0217074 0.5756103 48 S100A8/S100A9/CLU/PHGDH/ILK/…
GOBP_ASTROCYTE_DEVELOPMENT 2.105190 0.8857794 0.0000515 0.0359789 0.5573322 9 S100A8/S100A9/IFNGR1/GRN
GOBP_GLIOGENESIS 1.953508 0.5294465 0.0000452 0.0359789 0.5573322 58 S100A8/S100A9/CLU/PHGDH/ILK/…
GOBP_HUMORAL_IMMUNE_RESPONSE 1.934492 0.5344818 0.0001599 0.0863855 0.5188481 50 S100A9/S100A12/CLU/POU2F2/HLA-DRB1/…
GOBP_DEFENSE_RESPONSE_TO_BACTERIUM 1.853923 0.4865660 0.0001606 0.0863855 0.5188481 68 S100A8/S100A9/MPEG1/S100A12/RNASE2/…
GOBP_REGULATION_OF_NON_CANONICAL_WNT_SIGNALING_PATHWAY -1.563181 -0.9859825 0.0000807 0.0512741 0.5384341 3 DAB2/RNF213
GOBP_PROTEIN_DENEDDYLATION -1.800583 -0.8718514 0.0000467 0.0359789 0.5573322 9 TOR1A/COPS7A/COPS2/COPS3/COPS5

11.3.3.2.3.6 cluster_7
pathway NES ES pval padj log2err size leadingEdge
GOBP_ADAPTIVE_IMMUNE_RESPONSE 2.048789 0.6588388 0.0000000 0.0000000 1.1778933 285 TRBV24-1/IGKV1-9/JCHAIN/TRBV7-6/IGHA1/…
GOBP_IMMUNE_RESPONSE 1.683739 0.5271378 0.0000000 0.0000000 1.1690700 844 GNLY/TRBV24-1/IGKV1-9/GZMB/JCHAIN/…
GOBP_RESPONSE_TO_BACTERIUM 1.744615 0.5633550 0.0000000 0.0000097 0.7614608 255 GNLY/JCHAIN/IGHA1/IGKC/FOS/…
GOBP_DEFENSE_RESPONSE_TO_BACTERIUM 1.915474 0.6553466 0.0000014 0.0014097 0.6435518 97 GNLY/JCHAIN/IGHA1/IGKC/SPON2/…
GOBP_INNATE_IMMUNE_RESPONSE 1.515235 0.4824900 0.0000010 0.0014097 0.6435518 397 GZMB/JCHAIN/IGHA1/IGKC/IFITM3/…
GOBP_HUMORAL_IMMUNE_RESPONSE 1.948609 0.6758531 0.0000048 0.0038839 0.6105269 81 GNLY/JCHAIN/IGHA1/IGKC/IGLL5/…
GOBP_ANTIMICROBIAL_HUMORAL_RESPONSE 2.032802 0.8076655 0.0000449 0.0297954 0.5573322 29 GNLY/JCHAIN/IGHA1/SPON2/IGHA2/…
GOBP_CELL_KILLING 1.832861 0.6310662 0.0000511 0.0311168 0.5573322 90 GNLY/GZMB/TYROBP/ARG1/CX3CR1/…
GOBP_COMPLEMENT_ACTIVATION 1.966954 0.8112968 0.0001437 0.0807669 0.5188481 22 IGHA1/IGKC/IGLL5/IGHA2/IGHM/…
GOBP_POSITIVE_REGULATION_OF_RESPIRATORY_BURST 1.633790 0.9845274 0.0001799 0.0938862 0.5188481 4 JCHAIN/IGHA1/IGHA2

11.3.3.3 GO_MF

11.3.3.3.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.3.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.3.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates the pathway is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, it indicates that no pathways were significantly enriched in that cluster.

11.3.3.3.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOMF_SINGLE_STRANDED_DNA_BINDING 2.116148 0.6497663 0.0000000 0.0000606 0.7195128 111 MCM4/MCM2/MCM7/CDC45/HMGB2/…
GOMF_PEPTIDE_ANTIGEN_BINDING 2.047347 0.7822064 0.0000079 0.0044104 0.5933255 31 TRBV7-9/TRBV28/TRAV8-4/TRAV29DV5/TRAV12-1/…
GOMF_MHC_PROTEIN_BINDING 2.024207 0.7513247 0.0000267 0.0082618 0.5756103 37 TRBV7-9/CD8B/TRAV8-4/TRAV29DV5/TUBB4B/…
GOMF_SINGLE_STRANDED_DNA_HELICASE_ACTIVITY 1.977464 0.8021799 0.0000445 0.0082618 0.5573322 23 MCM4/MCM2/MCM7/RAD51/MCM5/…
GOMF_ANTIGEN_BINDING 1.843091 0.5625983 0.0000295 0.0082618 0.5756103 118 TRBV7-9/TRBV28/TRAV8-4/TRAV29DV5/TRAV12-1/…
GOMF_DNA_HELICASE_ACTIVITY 1.936528 0.6414983 0.0000732 0.0110135 0.5384341 67 MCM4/MCM2/MCM7/RAD51/MCM5/…
GOMF_DNA_REPLICATION_ORIGIN_BINDING 1.974101 0.8630608 0.0001478 0.0189926 0.5188481 15 MCM2/CDC45/MCM10/MCM5/CDC6/…
GOMF_CARBOXYLIC_ACID_BINDING 1.842536 0.6207618 0.0001707 0.0203563 0.5188481 64 PTGDS/TYMS/DHFR/FABP5/SHMT2/…
GOMF_ISOPRENOID_BINDING 1.836143 0.9050190 0.0005299 0.0465767 0.4772708 9 PTGDS/FABP5
GOMF_DNA_SECONDARY_STRUCTURE_BINDING 1.824337 0.6877532 0.0011219 0.0697218 0.4550599 33 CLSPN/HMGB2/RAD51AP1/RBBP8/HMGB3/…
GOMF_RNA_POLYMERASE_II_C_TERMINAL_DOMAIN_PHOSPHOSERINE_BINDING -1.656665 -0.9181357 0.0011874 0.0697218 0.4550599 5 PCIF1/RTF1
GOMF_RNA_POLYMERASE_II_C_TERMINAL_DOMAIN_BINDING -1.889651 -0.9095411 0.0000791 0.0110135 0.5384341 8 PCIF1/SCAF1/RTF1

11.3.3.3.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOMF_IMMUNOGLOBULIN_RECEPTOR_BINDING 2.515320 0.7591970 0.0000000 0.0000000 0.9101197 61 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOMF_ANTIGEN_BINDING 2.185833 0.5928714 0.0000000 0.0000006 0.8012156 125 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOMF_OXIDOREDUCTASE_ACTIVITY 1.497780 0.3594296 0.0001513 0.0628451 0.5188481 412 MT-ND2/MTHFD2/SRD5A1/CHCHD4/MT-CYB/…
GOMF_STRUCTURAL_CONSTITUENT_OF_RIBOSOME -2.003018 -0.5045388 0.0000067 0.0037351 0.6105269 150 RPL30/MRPL1/MRPS31/MRPL46/RPS5/…

11.3.3.3.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOMF_CYTOKINE_ACTIVITY 2.001337 0.6480582 0.0000289 0.0240666 0.5756103 43 CMTM8/CD40LG/CCL5/CXCL13/MIF/…
GOMF_ANTIGEN_BINDING 1.855698 0.5098333 0.0000216 0.0240666 0.5756103 111 HLA-DQB1/IGKV2-30/IGHV3-49/LAG3/IGLV2-23/…
GOMF_SIGNALING_RECEPTOR_REGULATOR_ACTIVITY 1.794035 0.5069550 0.0002520 0.0666386 0.4984931 95 CMTM8/CD40LG/AREG/CCL5/EDA/…
GOMF_NUCLEAR_RECEPTOR_COACTIVATOR_ACTIVITY -1.870840 -0.5582674 0.0004124 0.0858290 0.4984931 46 BRD8/TSG101/NCOA1/ETS1/PKN1
GOMF_GLUTATHIONE_TRANSMEMBRANE_TRANSPORTER_ACTIVITY -1.620461 -0.9829291 0.0002802 0.0666386 0.4984931 3 ABCC1
GOMF_TRIPEPTIDE_TRANSMEMBRANE_TRANSPORTER_ACTIVITY -1.620461 -0.9829291 0.0002802 0.0666386 0.4984931 3 ABCC1
GOMF_PEPTIDE_TRANSMEMBRANE_TRANSPORTER_ACTIVITY -1.888030 -0.9024118 0.0001746 0.0666386 0.5188481 7 ABCC1/TAP1/SLC15A4
GOMF_INTRAMOLECULAR_TRANSFERASE_ACTIVITY -1.958486 -0.7566921 0.0002373 0.0666386 0.5188481 17 DKC1/LSS/TRUB2/TRUB1/PGM2/…

11.3.3.3.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOMF_PEPTIDE_BINDING 1.860633 0.4803554 0.0000038 0.0032306 0.6105269 167 FCGR2B/C1QA/HLA-DRB1/HLA-DPA1/HLA-G/…
GOMF_MHC_CLASS_II_PROTEIN_COMPLEX_BINDING 2.161612 0.7860689 0.0000106 0.0044716 0.5933255 23 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/HLA-DQA1/…
GOMF_MHC_PROTEIN_COMPLEX_BINDING 2.094137 0.7297481 0.0000199 0.0062596 0.5756103 29 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/HLA-DQA1/…
GOMF_IMMUNE_RECEPTOR_ACTIVITY 1.940081 0.5743769 0.0000496 0.0107538 0.5573322 66 CX3CR1/FCGR3A/HLA-DRB1/HLA-DPA1/HLA-DQB1/…
GOMF_CHEMOKINE_RECEPTOR_BINDING 2.044113 0.7540300 0.0000934 0.0174540 0.5384341 22 CXCL10/CX3CR1/CXCL9/CCL3L3/CCL4/…
GOMF_LONG_CHAIN_FATTY_ACID_BINDING 1.893833 0.8795587 0.0001511 0.0253955 0.5188481 9 S100A8/ALOX5AP/S100A9/STX3
GOMF_MONOCARBOXYLIC_ACID_BINDING 1.958639 0.6566511 0.0002021 0.0308884 0.5188481 34 S100A8/ALOX5AP/S100A9/RBP7/GSTM2/…
GOMF_FATTY_ACID_BINDING 1.988337 0.7334556 0.0002265 0.0317318 0.5188481 22 S100A8/ALOX5AP/S100A9/RBP7/GSTM2/…
GOMF_TOLL_LIKE_RECEPTOR_BINDING 1.880313 0.7989108 0.0007662 0.0677905 0.4772708 12 S100A8/S100A9/TLR1
GOMF_CHEMOKINE_ACTIVITY 1.840349 0.7633174 0.0013191 0.0852833 0.4550599 14 CXCL10/CXCL9/CCL4/CXCL11/CXCL8/…
GOMF_MUSCLE_ALPHA_ACTININ_BINDING -1.694952 -0.8518713 0.0018847 0.0990080 0.4550599 7 PDLIM2/PKD2L1/PDLIM5
GOMF_GTPASE_INHIBITOR_ACTIVITY -1.739581 -0.7999095 0.0012166 0.0830559 0.4550599 10 IQGAP2/RHOH/IPO5/IQGAP1
GOMF_ENDORIBONUCLEASE_ACTIVITY -1.745404 -0.5422953 0.0012352 0.0830559 0.4550599 50 POP4/DBR1/POP5/RPP30/LACTB2/…
GOMF_STRUCTURAL_MOLECULE_ACTIVITY_CONFERRING_ELASTICITY -1.710554 -0.9407178 0.0004029 0.0403182 0.4984931 5 EMILIN2/LAMC1
GOMF_GAMMA_TUBULIN_BINDING -1.916167 -0.6931893 0.0003459 0.0403182 0.4984931 24 LYN/BRCA2/BLOC1S2/WASHC1/TUBGCP6/…

11.3.3.3.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOMF_IMMUNE_RECEPTOR_ACTIVITY 2.371653 0.6794128 0.0000004 0.0005664 0.6749629 41 HLA-DQA1/IFNGR2/FCGR1A/HLA-DRB1/HLA-DPA1/…
GOMF_MHC_CLASS_II_PROTEIN_COMPLEX_BINDING 2.283245 0.7498025 0.0000181 0.0134317 0.5756103 23 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/CD4/…
GOMF_MHC_CLASS_II_RECEPTOR_ACTIVITY 2.255942 0.8883127 0.0000690 0.0290381 0.5384341 9 HLA-DQA1/HLA-DRB1/HLA-DPA1/HLA-DQB1/HLA-DRA/…
GOMF_RAGE_RECEPTOR_BINDING 1.935802 0.9524268 0.0000782 0.0290381 0.5384341 5 S100A8/S100A12/HMGB2
GOMF_MHC_PROTEIN_COMPLEX_BINDING 2.180967 0.6837297 0.0002132 0.0527555 0.5188481 26 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/CD4/…
GOMF_ENDORIBONUCLEASE_ACTIVITY_PRODUCING_3_PHOSPHOMONOESTERS 1.723298 0.9815504 0.0002046 0.0527555 0.5188481 3 RNASE2/RNASE1/TSEN34
GOMF_CARBOXYLIC_ACID_BINDING 2.066286 0.6265775 0.0002909 0.0617214 0.4984931 30 S100A8/S100A9/PTGDS/SLC19A1/SCP2/…
GOMF_SEQUENCE_SPECIFIC_MRNA_BINDING -1.674101 -0.9327355 0.0003886 0.0721290 0.4984931 5 DHX9/ETF1

11.3.3.3.3.6 cluster_7
pathway NES ES pval padj log2err size leadingEdge
GOMF_ANTIGEN_BINDING 2.206274 0.7773854 0.0e+00 0.0000062 0.7614608 65 JCHAIN/IGHA1/IGKC/TRAV8-4/TRBV12-3/…
GOMF_IMMUNOGLOBULIN_RECEPTOR_BINDING 2.061595 0.8908119 3.8e-06 0.0030738 0.6105269 17 JCHAIN/IGHA1/IGKC/IGLL5/IGHA2/…

11.3.3.4 GO_CC

11.3.3.4.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.4.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.4.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates the pathway is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, it indicates that no pathways were significantly enriched in that cluster.

11.3.3.4.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
GOCC_CHROMOSOMAL_REGION 2.153562 0.6019238 0.0000000 0.0000000 1.0073180 294 CENPM/MCM4/CENPN/MCM2/MCM7/…
GOCC_T_CELL_RECEPTOR_COMPLEX 2.463714 0.8062666 0.0000000 0.0000000 0.9653278 73 TRBV7-9/TRBV28/CD8B/TRAV8-4/TRBV30/…
GOCC_PLASMA_MEMBRANE_SIGNALING_RECEPTOR_COMPLEX 2.345398 0.7105977 0.0000000 0.0000000 0.9436322 132 TRBV7-9/TRBV28/CD8B/TRAV8-4/TRBV30/…
GOCC_RECEPTOR_COMPLEX 2.190003 0.6324329 0.0000000 0.0000000 0.8986712 201 TRBV7-9/TRBV28/CD8B/TRAV8-4/TRBV30/…
GOCC_PLASMA_MEMBRANE_PROTEIN_COMPLEX 2.042605 0.5792467 0.0000000 0.0000000 0.8634154 266 TRBV7-9/TRBV28/CD8B/TRAV8-4/TRBV30/…
GOCC_NUCLEAR_CHROMOSOME 2.121877 0.6186505 0.0000000 0.0000000 0.8140358 180 H3C2/GINS2/MCM4/MCM2/MCM7/…
GOCC_PROTEIN_DNA_COMPLEX 2.184620 0.6481419 0.0000000 0.0000000 0.8140358 150 H3C2/GINS2/MCM2/CDC45/H2AX/…
GOCC_CONDENSED_CHROMOSOME 2.074924 0.5995676 0.0000000 0.0000000 0.8140358 199 CENPM/CENPN/MKI67/CENPU/ZWINT/…
GOCC_DNA_PACKAGING_COMPLEX 2.242180 0.7117612 0.0000000 0.0000001 0.7881868 92 H3C2/NCAPH/SMC4/H2BC11/H2AZ1/…
GOCC_CONDENSED_CHROMOSOME_CENTROMERIC_REGION 2.000353 0.5969363 0.0000003 0.0000236 0.6749629 145 CENPM/CENPN/CENPU/ZWINT/CENPW/…
GOCC_DENDRITE_MEMBRANE -1.673108 -0.9045261 0.0045097 0.0987124 0.4070179 5 ATP2B1/AKAP5
GOCC_CHLORIDE_CHANNEL_COMPLEX -1.725759 -0.8015903 0.0031206 0.0749714 0.4317077 9 OSTM1/TTYH2/CLCC1
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -1.837411 -0.5694257 0.0023245 0.0620873 0.4317077 50 RPL6/RPL36A/RPL10A/RPL5/RPL36/…
GOCC_POLYSOMAL_RIBOSOME -1.838438 -0.6344751 0.0023322 0.0620873 0.4317077 29 RPL6/RPL36A/RPL10A/RPL36/RPS23/…
GOCC_INTEGRAL_COMPONENT_OF_LYSOSOMAL_MEMBRANE -1.657898 -0.9489220 0.0013453 0.0456942 0.4550599 4 SLC46A3
GOCC_CYTOSOLIC_RIBOSOME -1.697702 -0.4786446 0.0011482 0.0403915 0.4550599 91 RPL6/RPL36A/RPL10A/RPL5/RPL36/…
GOCC_RIBOSOME -1.552780 -0.3895046 0.0008887 0.0336693 0.4772708 194 MPV17L2/MRPL58/RPL6/MRPL42/RPL36A/…
GOCC_POLYSOME -1.865676 -0.5621103 0.0008361 0.0329405 0.4772708 58 RPL6/DIS3L2/RPL36A/RPL10A/RPL36/…
GOCC_INTRINSIC_COMPONENT_OF_VACUOLAR_MEMBRANE -1.838011 -0.8537298 0.0005833 0.0249817 0.4772708 9 SLC46A3

11.3.3.4.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
GOCC_IMMUNOGLOBULIN_COMPLEX 2.728632 0.7431779 0.0000000 0.0000000 1.2378967 122 IGHV5-10-1/IGHV3-49/IGKV6-21/IGLV4-3/IGLV5-37/…
GOCC_IMMUNOGLOBULIN_COMPLEX_CIRCULATING 2.492987 0.7658562 0.0000000 0.0000000 0.9325952 60 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 1.770614 0.4556816 0.0000053 0.0010405 0.6105269 204 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOCC_CELL_SURFACE 1.637783 0.4000528 0.0000051 0.0010405 0.6105269 347 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOCC_SIDE_OF_MEMBRANE 1.517654 0.3731209 0.0001565 0.0140818 0.5188481 329 IGHV5-10-1/IGHV3-49/IGHV3-66/IGHV1-69-2/IGHV6-1/…
GOCC_RIBOSOME -1.695147 -0.4193540 0.0001520 0.0140818 0.5188481 190 RPL30/MRPL1/MRPL46/MRPL15/RPS5/…
GOCC_RIBONUCLEOPROTEIN_COMPLEX -1.477137 -0.3269321 0.0000212 0.0023274 0.5756103 594 RPL30/MRPL1/EFTUD2/EBNA1BP2/USP39/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -2.140987 -0.6615629 0.0000129 0.0016015 0.5933255 49 MRPL1/RPL22/RPL17/RPL13/RPL29/…
GOCC_RIBOSOMAL_SUBUNIT -1.875132 -0.4750649 0.0000103 0.0014601 0.5933255 160 RPL30/MRPL1/MRPL46/MRPL15/RPS5/…
GOCC_LARGE_RIBOSOMAL_SUBUNIT -2.088079 -0.5647408 0.0000098 0.0014601 0.5933255 103 RPL30/MRPL1/MRPL46/MRPL15/RPL22/…
GOCC_CYTOSOLIC_RIBOSOME -2.118784 -0.5902958 0.0000023 0.0007649 0.6272567 89 RPL30/MRPL1/RPS5/RPL22/RPL35/…

11.3.3.4.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
GOCC_T_CELL_RECEPTOR_COMPLEX 1.903663 0.5366106 0.0000400 0.0195423 0.5573322 94 TRBV18/TRBV10-1/TRAV25/TRBV11-1/TRAV5/…
GOCC_PLASMA_MEMBRANE_SIGNALING_RECEPTOR_COMPLEX 1.767999 0.4695008 0.0000340 0.0195423 0.5573322 150 TRBV18/TRAV27/TRBV10-1/TRAV25/TRBV11-1/…
GOCC_IMMUNOGLOBULIN_COMPLEX 1.885940 0.5464453 0.0001772 0.0577721 0.5188481 77 IGKV2-30/IGHV3-49/IGLV2-23/IGLV4-69/IGKV1-16/…
GOCC_RECEPTOR_COMPLEX 1.606175 0.4050069 0.0002639 0.0645330 0.4984931 218 TRBV18/TRAV27/TRBV10-1/TRAV25/TRBV11-1/…
GOCC_PLASMA_MEMBRANE_PROTEIN_COMPLEX 1.555189 0.3821801 0.0004614 0.0902469 0.4984931 279 TRBV18/TRAV27/HLA-DQB1/TRBV10-1/TRAV25/…

11.3.3.4.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
GOCC_SIDE_OF_MEMBRANE 1.846697 0.4466567 0.0000000 0.0000207 0.7337620 308 CXCL10/CX3CR1/FCGR2B/FCGR3A/CXCL9/…
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 1.909978 0.4893222 0.0000002 0.0000913 0.6901325 192 CXCL10/CX3CR1/ITGA4/FCGR2B/CD86/…
GOCC_MHC_CLASS_II_PROTEIN_COMPLEX 2.007781 0.8348291 0.0000609 0.0100325 0.5573322 14 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/HLA-DQA1/…
GOCC_ENDOPLASMIC_RETICULUM_CHAPERONE_COMPLEX 1.948849 0.8805825 0.0000531 0.0100325 0.5573322 10 SDF2L1/DNAJB11/P4HB/DNAJC10/PDIA6/…
GOCC_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX 1.822406 0.5069833 0.0000814 0.0100595 0.5384341 98 S100A8/C1QA/C1QB/S100A9/C1QC/…
GOCC_MHC_PROTEIN_COMPLEX 1.930571 0.7173907 0.0003378 0.0303714 0.4984931 22 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/HLA-DQB1/…
GOCC_ENDOCYTIC_VESICLE_LUMEN 1.895734 0.8172113 0.0003854 0.0317595 0.4984931 12 HP/CALR/APOA1/HYOU1/CTSL/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE_MEMBRANE 1.868321 0.5999068 0.0004537 0.0345193 0.4984931 41 HLA-DRB1/CD9/HLA-DPA1/HLA-DQB1/HLA-DQA1/…
GOCC_PROTEIN_LIPID_COMPLEX 1.926354 0.7810465 0.0005202 0.0367516 0.4772708 16 DBI/APOA1/MSR1/BIN1/APOO/…
GOCC_LUMENAL_SIDE_OF_ENDOPLASMIC_RETICULUM_MEMBRANE 1.812523 0.6635407 0.0015532 0.0808476 0.4550599 25 HLA-DRB1/HLA-DPA1/HLA-G/HLA-DQB1/HLA-DQA1/…
GOCC_CYTOSOLIC_RIBOSOME -1.737803 -0.4781041 0.0007586 0.0494222 0.4772708 89 RPL23/RPL18A/RPL29/RPL18/RPL36AL/…
GOCC_CYTOSOLIC_LARGE_RIBOSOMAL_SUBUNIT -1.844486 -0.5587320 0.0008101 0.0494222 0.4772708 50 RPL23/RPL18A/RPL29/RPL18/RPL36AL/…

11.3.3.4.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
GOCC_MHC_CLASS_II_PROTEIN_COMPLEX 2.370310 0.8615979 0.0000035 0.0019067 0.6272567 15 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE_MEMBRANE 2.341537 0.7377665 0.0000041 0.0019067 0.6105269 26 HLA-DQA1/FCGR1A/HLA-DRB1/HLA-DPB1/HLA-DPA1/…
GOCC_CLATHRIN_COATED_ENDOCYTIC_VESICLE 2.245486 0.6889069 0.0000300 0.0069595 0.5756103 30 HLA-DQA1/CLTA/FCGR1A/HLA-DRB1/HLA-DPB1/…
GOCC_CLATHRIN_COATED_VESICLE_MEMBRANE 2.007773 0.5721868 0.0002714 0.0493861 0.4984931 38 HLA-DQA1/CLTA/FCGR1A/HLA-DRB1/HLA-DPB1/…
GOCC_MHC_PROTEIN_COMPLEX 2.063584 0.6905143 0.0005325 0.0585630 0.4772708 20 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
GOCC_EXTERNAL_ENCAPSULATING_STRUCTURE 1.756124 0.4522910 0.0006322 0.0585630 0.4772708 64 S100A8/S100A9/CLU/VCAN/CTSH/…
GOCC_ANCHORED_COMPONENT_OF_PLASMA_MEMBRANE 1.704845 0.9721144 0.0008103 0.0626606 0.4772708 3 CD14/CD2
GOCC_CLATHRIN_COATED_VESICLE 1.691293 0.4355936 0.0015421 0.0954059 0.4550599 64 HLA-DQA1/CLTA/FCGR1A/HLA-DRB1/HLA-DPB1/…
GOCC_INTRACELLULAR_PROTEIN_CONTAINING_COMPLEX -1.394951 -0.4214633 0.0010681 0.0729824 0.4550599 299 TMEM183A/UBE2D2/PCGF5/PSMD4/PAXX/…
GOCC_EXTRINSIC_COMPONENT_OF_ENDOPLASMIC_RETICULUM_MEMBRANE -1.569832 -0.9305318 0.0011010 0.0729824 0.4550599 4 TOR1A/PML/CCDC115
GOCC_COP9_SIGNALOSOME -1.719149 -0.7984972 0.0006942 0.0585630 0.4772708 11 COPS9/COPS7A/COPS2/COPS3/COPS5
GOCC_U5_SNRNP -1.776350 -0.7818248 0.0003193 0.0493861 0.4984931 14 SNRPD1/SNRNP40/SNRPG/SNRPN/SNRPD3
GOCC_CATALYTIC_COMPLEX -1.373552 -0.4039981 0.0000211 0.0065218 0.5756103 659 TMEM183A/SNRPD1/UBE2D2/PCGF5/SNRNP40/…

11.3.3.4.3.6 cluster_7
pathway NES ES pval padj log2err size leadingEdge
GOCC_T_CELL_RECEPTOR_COMPLEX 2.428293 0.8660192 0.00e+00 0.0000000 0.9759947 55 TRBV24-1/TRBV7-6/TRBV7-2/TRAV3/TRAV5/…
GOCC_PLASMA_MEMBRANE_PROTEIN_COMPLEX 2.024855 0.6560111 0.00e+00 0.0000000 0.9759947 205 TRBV24-1/TRBV7-6/TRBV7-2/TRAV3/TRAV5/…
GOCC_PLASMA_MEMBRANE_SIGNALING_RECEPTOR_COMPLEX 2.224667 0.7555429 0.00e+00 0.0000000 0.9325952 97 TRBV24-1/TRBV7-6/TRBV7-2/TRAV3/TRAV20/…
GOCC_RECEPTOR_COMPLEX 2.140080 0.7094074 0.00e+00 0.0000000 0.9325952 135 TRBV24-1/TRBV7-6/TRBV7-2/TRAV3/TRAV5/…
GOCC_IMMUNOGLOBULIN_COMPLEX 2.270549 0.8803607 0.00e+00 0.0000019 0.7477397 29 IGKV1-9/JCHAIN/IGHA1/IGKC/IGKV1-6/…
GOCC_IMMUNOGLOBULIN_COMPLEX_CIRCULATING 2.116352 0.9200841 2.00e-07 0.0000295 0.6901325 15 JCHAIN/IGHA1/IGKC/IGLL5/IGHA2/…
GOCC_IGA_IMMUNOGLOBULIN_COMPLEX 1.673127 0.9869494 5.00e-06 0.0006066 0.6105269 4 JCHAIN/IGHA1/IGHA2/IGKV3-20
GOCC_EXTERNAL_SIDE_OF_PLASMA_MEMBRANE 1.764678 0.5820632 5.70e-06 0.0006079 0.6105269 147 IGHA1/IGKC/CX3CR1/FCRL6/TRGV5/…
GOCC_BLOOD_MICROPARTICLE 2.035826 0.7727473 1.06e-05 0.0010199 0.5933255 34 JCHAIN/IGHA1/IGKC/IGHA2/IGKV3-20/…
GOCC_CELL_SURFACE 1.546065 0.4966569 3.60e-05 0.0031596 0.5573322 259 IGHA1/IGKC/TYROBP/CX3CR1/FCRL6/…

11.3.3.5 CP_REACTOME

11.3.3.5.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.5.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.5.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates that the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates that it is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.3.3.5.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
REACTOME_CELL_CYCLE 2.336511 0.6167624 0.0000000 0.0000000 1.5092628 606 TYMS/H3C2/GINS2/CENPM/MCM4/…
REACTOME_CELL_CYCLE_MITOTIC 2.416861 0.6461838 0.0000000 0.0000000 1.4675240 476 TYMS/H3C2/GINS2/CENPM/MCM4/…
REACTOME_DNA_REPLICATION 2.426995 0.7126979 0.0000000 0.0000000 1.0864405 165 H3C2/GINS2/MCM4/MCM2/MCM7/…
REACTOME_CELL_CYCLE_CHECKPOINTS 2.321792 0.6551597 0.0000000 0.0000000 1.0864405 239 CENPM/MCM4/CENPN/MCM2/CLSPN/…
REACTOME_DNA_REPLICATION_PRE_INITIATION 2.480731 0.7554312 0.0000000 0.0000000 1.0574636 125 H3C2/MCM4/MCM2/MCM7/CDC45/…
REACTOME_MITOTIC_G1_PHASE_AND_G1_S_TRANSITION 2.452254 0.7503150 0.0000000 0.0000000 1.0574636 122 TYMS/MCM4/TK1/RRM2/MCM2/…
REACTOME_RESOLUTION_OF_SISTER_CHROMATID_COHESION 2.257934 0.6973454 0.0000000 0.0000000 0.8513391 110 CENPM/CENPN/CENPU/ZWINT/CDC20/…
REACTOME_ACTIVATION_OF_ATR_IN_RESPONSE_TO_REPLICATION_STRESS 2.305606 0.8866198 0.0000000 0.0000000 0.8140358 29 MCM4/MCM2/CLSPN/MCM7/CDC45/…
REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE 2.278048 0.7832954 0.0000000 0.0000000 0.8140358 56 CENPM/CENPN/CENPU/CENPW/H2AX/…
REACTOME_MEIOTIC_RECOMBINATION 2.294077 0.7767556 0.0000000 0.0000001 0.8012156 63 H3C2/RAD51/H2AX/RBBP8/H2AC14/…
REACTOME_ROS_AND_RNS_PRODUCTION_IN_PHAGOCYTES -1.625722 -0.5977521 0.0080684 0.0851781 0.3807304 25 ATP6V1E2/NCF4/ATP6V0C/ATP6V1E1/NOS3/…
REACTOME_SUMOYLATION_OF_SUMOYLATION_PROTEINS -1.639650 -0.5492923 0.0060783 0.0673323 0.4070179 35 NUP188/NUP88/NUP50/NUP35/NUP205/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -1.539416 -0.4243047 0.0047827 0.0549138 0.4070179 99 RPL6/RPL36A/RPL10A/RPL5/RPL36/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -1.537413 -0.4301516 0.0043859 0.0518722 0.4070179 94 RPL6/RPL36A/RPL10A/RPL5/RPL36/…
REACTOME_BETA_OXIDATION_OF_VERY_LONG_CHAIN_FATTY_ACIDS -1.726972 -0.7958374 0.0027579 0.0349849 0.4317077 10 ECI2/DECR2/EHHADH/ACOX1
REACTOME_ROLE_OF_ABL_IN_ROBO_SLIT_SIGNALING -1.612813 -0.9462357 0.0021098 0.0286094 0.4317077 4 CAP1/CLASP2
REACTOME_SLC_TRANSPORTER_DISORDERS -1.823149 -0.5554837 0.0011372 0.0165625 0.4550599 53 SLC9A6/NUP188/NUP88/SLC4A4/SLC22A18/…
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -1.777327 -0.5042382 0.0004321 0.0073884 0.4984931 86 RPL6/RPL36A/RPL10A/EEF1G/RPL5/…
REACTOME_RRNA_PROCESSING -1.634518 -0.4149927 0.0001770 0.0033145 0.5188481 191 RPL6/TBL3/ISG20L2/RPL36A/RPL10A/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -1.790883 -0.4936929 0.0001126 0.0022410 0.5384341 107 SRP9/RPL6/TRAM1/RPL36A/RPL10A/…

11.3.3.5.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
REACTOME_COMPLEMENT_CASCADE 2.009910 0.6029164 0.0000050 0.0025781 0.6105269 66 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/CFD/…
REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS 1.975443 0.5805936 0.0000048 0.0025781 0.6105269 78 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_CD22_MEDIATED_BCR_REGULATION 1.979083 0.6131349 0.0000169 0.0030933 0.5756103 57 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_ROLE_OF_LAT2_NTAL_LAB_ON_CALCIUM_MOBILIZATION 1.973968 0.6000759 0.0000171 0.0030933 0.5756103 62 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_FCERI_MEDIATED_CA_2_MOBILIZATION 1.964366 0.5798315 0.0000106 0.0030933 0.5933255 76 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_FCERI_MEDIATED_MAPK_ACTIVATION 1.940688 0.5775116 0.0000129 0.0030933 0.5933255 72 IGHV1-46/IGKV1D-16/JUN/IGLV2-23/IGKV3D-20/…
REACTOME_ROLE_OF_PHOSPHOLIPIDS_IN_PHAGOCYTOSIS 1.944950 0.5805449 0.0000230 0.0032541 0.5756103 71 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 1.954073 0.6053867 0.0000281 0.0036550 0.5756103 57 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/CFD/…
REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 1.943205 0.6062028 0.0000476 0.0049445 0.5573322 54 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_FCGR_ACTIVATION 1.899748 0.5826955 0.0000732 0.0060082 0.5384341 60 IGHV1-46/IGKV1D-16/IGLV2-23/IGKV3D-20/IGHV4-34/…
REACTOME_REGULATION_BY_C_FLIP -1.904890 -0.8620732 0.0009415 0.0432820 0.4772708 9 TNFSF10
REACTOME_CASPASE_ACTIVATION_VIA_DEATH_RECEPTORS_IN_THE_PRESENCE_OF_LIGAND -1.910440 -0.7941829 0.0009439 0.0432820 0.4772708 12 TNFSF10
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -1.939184 -0.5161508 0.0001256 0.0093251 0.5188481 99 RPL30/RPS5/RPL22/RPL35/RPL17/…
REACTOME_NONSENSE_MEDIATED_DECAY_NMD -2.000480 -0.5211696 0.0001032 0.0080426 0.5384341 111 RPL30/RPS5/RPL22/GSPT2/RPL35/…
REACTOME_CELLULAR_RESPONSE_TO_STARVATION -1.847517 -0.4701529 0.0000642 0.0058851 0.5384341 137 RPL30/NPRL3/FNIP1/RPS5/RPL22/…
REACTOME_INFLUENZA_INFECTION -1.841868 -0.4609878 0.0000590 0.0057483 0.5573322 150 RPL30/NUP93/RPS5/NUP98/RPL22/…
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -2.078395 -0.5672509 0.0000340 0.0040765 0.5573322 87 RPL30/RPS5/RPL22/RPL35/RPL17/…
REACTOME_RRNA_PROCESSING -1.857234 -0.4566138 0.0000124 0.0030933 0.5933255 189 RPL30/EBNA1BP2/WDR12/RPP40/TRMT10C/…
REACTOME_SELENOAMINO_ACID_METABOLISM -2.010487 -0.5373524 0.0000198 0.0030933 0.5756103 98 RPL30/RPS5/RPL22/RPL35/RPL17/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -2.025702 -0.5315205 0.0000194 0.0030933 0.5756103 108 RPL30/RPS5/RPN2/RPL22/RPL35/…

11.3.3.5.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM 1.649233 0.3886906 0.0000002 0.0003546 0.6901325 485 IFITM3/HLA-DQB1/SOCS2/PIM1/IFNGR2/…
REACTOME_SIGNALING_BY_INTERLEUKINS 1.604933 0.3967749 0.0000201 0.0156992 0.5756103 301 SOCS2/PIM1/FOS/CISH/JUN/…
REACTOME_GROWTH_HORMONE_RECEPTOR_SIGNALING 2.009447 0.7962464 0.0000495 0.0258632 0.5573322 17 SOCS2/CISH/SOCS3/SOCS1/STAT1/…
REACTOME_INTERLEUKIN_7_SIGNALING 1.961555 0.7252117 0.0001129 0.0442082 0.5384341 24 SOCS2/CISH/H3C2/SOCS1/H3C10/…

11.3.3.5.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
REACTOME_MHC_CLASS_II_ANTIGEN_PRESENTATION 1.974478 0.5634924 0.0000079 0.0122063 0.5933255 87 TUBB2A/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/…
REACTOME_GPCR_LIGAND_BINDING 1.912075 0.5410320 0.0000226 0.0122063 0.5756103 96 CXCL10/CX3CR1/CXCL9/PTGER2/FPR2/…
REACTOME_PD_1_SIGNALING 2.020237 0.7966822 0.0001194 0.0358340 0.5384341 16 HLA-DRB1/HLA-DPA1/HLA-DQB1/HLA-DQA1/HLA-DPB1/…
REACTOME_CHEMOKINE_RECEPTORS_BIND_CHEMOKINES 1.995526 0.7619693 0.0001373 0.0358340 0.5188481 20 CXCL10/CX3CR1/CXCL9/CCL3L3/CCL4/…
REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE 1.905490 0.6088303 0.0002177 0.0402353 0.5188481 45 CENPX/H2AZ2/H2AC8/H2BC11/MIS18BP1/…
REACTOME_GENERATION_OF_SECOND_MESSENGER_MOLECULES 1.910266 0.6833438 0.0006296 0.0646308 0.4772708 26 HLA-DRB1/HLA-DPA1/HLA-DQB1/HLA-DQA1/HLA-DPB1/…
REACTOME_APC_C_MEDIATED_DEGRADATION_OF_CELL_CYCLE_PROTEINS 1.684110 0.4931344 0.0006350 0.0646308 0.4772708 76 PTTG1/UBE2C/UBE2D1/PSMB3/PSMD3/…
REACTOME_CHROMOSOME_MAINTENANCE 1.680580 0.4689999 0.0005802 0.0646308 0.4772708 106 CENPX/RFC4/H2AZ2/H2AC8/H2BC11/…
REACTOME_PEPTIDE_LIGAND_BINDING_RECEPTORS 1.809106 0.5990433 0.0010158 0.0757483 0.4550599 40 CXCL10/CX3CR1/CXCL9/FPR2/CCL3L3/…
REACTOME_CLASS_A_1_RHODOPSIN_LIKE_RECEPTORS 1.741688 0.5429726 0.0009274 0.0757483 0.4772708 53 CXCL10/CX3CR1/CXCL9/CCL3L3/CYSLTR1/…
REACTOME_EUKARYOTIC_TRANSLATION_INITIATION -1.594646 -0.4515595 0.0014797 0.0965472 0.4550599 94 EIF5/RPL23/RPL18A/RPL29/RPL18/…
REACTOME_IL_6_TYPE_CYTOKINE_RECEPTOR_LIGAND_INTERACTIONS -1.759275 -0.8961184 0.0010663 0.0758989 0.4550599 6 TYK2
REACTOME_HUR_ELAVL1_BINDS_AND_STABILIZES_MRNA -1.778908 -0.8289656 0.0010054 0.0757483 0.4550599 8 TNFSF13/PRKCD
REACTOME_INTERLEUKIN_6_FAMILY_SIGNALING -1.893517 -0.7834492 0.0007116 0.0655468 0.4772708 13 TYK2/CBL
REACTOME_NONSENSE_MEDIATED_DECAY_NMD -1.626813 -0.4475012 0.0006603 0.0646308 0.4772708 110 RPL23/RPL18A/RPL29/RPL36AL/FAU/…
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE -1.684179 -0.4664993 0.0002866 0.0408023 0.4984931 106 SRP72/RPL23/RPL18A/RPL29/RPL18/…
REACTOME_RESPONSE_OF_EIF2AK4_GCN2_TO_AMINO_ACID_DEFICIENCY -1.736423 -0.4873728 0.0002312 0.0402353 0.5188481 98 DDIT3/RPL23/RPL18A/RPL29/RPL18/…
REACTOME_INTERLEUKIN_35_SIGNALLING -1.874964 -0.8737273 0.0001602 0.0358340 0.5188481 8 TYK2
REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION -1.827911 -0.5246989 0.0000465 0.0182012 0.5573322 86 EEF1D/RPL23/RPL18A/RPL29/RPL18/…

11.3.3.5.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
REACTOME_PD_1_SIGNALING 2.359489 0.9174443 0.0000003 0.0005014 0.6749629 12 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/CD4/…
REACTOME_ANTIMICROBIAL_PEPTIDES 2.276290 0.8737225 0.0000049 0.0018192 0.6105269 13 S100A8/S100A9/CLU/CD4
REACTOME_TOLL_LIKE_RECEPTOR_TLR1_TLR2_CASCADE 2.171612 0.6254880 0.0000041 0.0018192 0.6105269 44 S100A8/CD14/S100A9/S100A12/MEF2A/…
REACTOME_DISEASES_OF_IMMUNE_SYSTEM 2.237486 0.8398287 0.0000126 0.0037327 0.5933255 14 S100A8/CD14/S100A9/NFKB1
REACTOME_IRAK4_DEFICIENCY_TLR2_4 2.217217 0.8918020 0.0000153 0.0037688 0.5756103 11 S100A8/CD14/S100A9
REACTOME_REGULATION_OF_TLR_BY_ENDOGENOUS_LIGAND 2.164041 0.9115141 0.0000281 0.0059318 0.5756103 9 S100A8/CD14/S100A9/GSDMD
REACTOME_GENERATION_OF_SECOND_MESSENGER_MOLECULES 2.107443 0.7523408 0.0002823 0.0479045 0.4984931 17 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/CD4/…
REACTOME_TOLL_LIKE_RECEPTOR_CASCADES 1.835710 0.4712314 0.0006201 0.0834250 0.4772708 65 S100A8/CD14/S100A9/S100A12/MEF2A/…
REACTOME_TRANSLATION -1.443926 -0.4391890 0.0004220 0.0624586 0.4984931 236 RPL22L1/AURKAIP1/EIF3A/YARS1/EIF2S3/…
REACTOME_MRNA_SPLICING -1.533618 -0.4797735 0.0002913 0.0479045 0.4984931 153 SNRPD1/PUF60/SNRNP40/SNRPG/CASC3/…

11.3.3.5.3.6 cluster_7
pathway NES ES pval padj log2err size leadingEdge
REACTOME_FCERI_MEDIATED_MAPK_ACTIVATION 2.109754 0.8243360 0.0000097 0.0099276 0.5933255 28 IGHV1-2/FOS/FCER1G/IGKV3-20/IGKV4-1/…
REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL 1.931184 0.6760622 0.0000130 0.0099276 0.5933255 70 IGHV1-2/TRAV8-4/TYROBP/TRBV12-3/KLRG1/…
REACTOME_CD22_MEDIATED_BCR_REGULATION 1.974327 0.8717231 0.0000394 0.0129559 0.5573322 14 IGHV1-2/CD79B/CD79A/IGKV3-20/IGHM/…
REACTOME_CELL_SURFACE_INTERACTIONS_AT_THE_VASCULAR_WALL 1.953504 0.6885833 0.0000310 0.0129559 0.5573322 65 IGHV1-2/JCHAIN/IGHA1/FCER1G/IGHA2/…
REACTOME_SCAVENGING_OF_HEME_FROM_PLASMA 1.937776 0.8842414 0.0000474 0.0129559 0.5573322 12 IGHV1-2/IGHA1/IGHA2/IGKV3-20/IGHV4-59
REACTOME_DNA_DAMAGE_TELOMERE_STRESS_INDUCED_SENESCENCE 1.859313 0.6651506 0.0000508 0.0129559 0.5573322 57 H2BC13/LMNB1/H2BC14/H2BC3/H1-1/…
REACTOME_SENESCENCE_ASSOCIATED_SECRETORY_PHENOTYPE_SASP 1.727124 0.5963017 0.0001164 0.0254469 0.5384341 82 FOS/H2BC13/H2BC14/H2BC3/H2BC11/…
REACTOME_RUNX1_REGULATES_GENES_INVOLVED_IN_MEGAKARYOCYTE_DIFFERENTIATION_AND_PLATELET_FUNCTION 1.780213 0.6322222 0.0002977 0.0569338 0.4984931 61 H2BC13/H2BC14/H2BC3/PRMT1/H2BC11/…
REACTOME_FCGR_ACTIVATION 2.002534 0.8512086 0.0004058 0.0620838 0.4984931 17 IGHV1-2/IGKV3-20/CD247/IGKV4-1/IGHV4-59/…
REACTOME_ACTIVATED_PKN1_STIMULATES_TRANSCRIPTION_OF_AR_ANDROGEN_RECEPTOR_REGULATED_GENES_KLK2_AND_KLK3 1.829496 0.6737836 0.0007073 0.0983737 0.4772708 44 H2BC13/H2BC14/H2BC3/H2BC11/H2AC6/…

11.3.3.6 CP_KEGG

11.3.3.6.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.6.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.6.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates that the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates that it is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.3.3.6.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 2.171408 0.6966576 0.0000000 0.0000007 0.7477397 84 H3C2/H2AX/H2AC14/H2BC11/H2AZ1/…
KEGG_CELL_CYCLE 2.141807 0.6693173 0.0000000 0.0000007 0.7614608 106 MCM2/MCM7/PCNA/CDC45/CHEK1/…
KEGG_DNA_REPLICATION 2.209945 0.8330369 0.0000002 0.0000099 0.6901325 30 MCM4/MCM2/MCM7/PCNA/FEN1/…
KEGG_HEMATOPOIETIC_CELL_LINEAGE 1.985047 0.7252280 0.0000903 0.0041978 0.5384341 37 CD8B/CD3G/IL7R/CD3E/CD3D/…
KEGG_ONE_CARBON_POOL_BY_FOLATE 1.939099 0.8656199 0.0001199 0.0044585 0.5384341 14 TYMS/DHFR/SHMT2/SHMT1/MTHFD2/…
KEGG_PRIMARY_IMMUNODEFICIENCY 1.926642 0.7595443 0.0001469 0.0045542 0.5188481 26 CD8B/IL7R/CD3E/CD3D/UNG/…
KEGG_P53_SIGNALING_PATHWAY 1.799546 0.6276497 0.0004796 0.0127436 0.4984931 53 RRM2/CHEK1/GTSE1/BAX/CCNB2/…
KEGG_OOCYTE_MEIOSIS 1.764657 0.5788121 0.0006797 0.0158039 0.4772708 76 PKMYT1/CDC20/PTTG1/YWHAE/SGO1/…
KEGG_GLYCINE_SERINE_AND_THREONINE_METABOLISM 1.713269 0.7798135 0.0044324 0.0716371 0.4070179 13 SHMT2/PSAT1/PSPH/SHMT1/PHGDH/…
KEGG_GLUTATHIONE_METABOLISM 1.700636 0.6373135 0.0046217 0.0716371 0.4070179 32 GSTM1/GGT7/GSTM2/GSR/RRM1/…
KEGG_RIBOSOME -1.642708 -0.4668809 0.0013922 0.0287728 0.4550599 84 RPL6/RPL36A/RPL10A/RPL5/RPL36/…

11.3.3.6.3.2 cluster_2
pathway NES ES pval padj log2err size leadingEdge
KEGG_RIBOSOME -2.058845 -0.5795824 1.04e-05 0.0019406 0.5933255 84 RPL30/RPS5/RPL22/RPL35/RPL17/…

11.3.3.6.3.3 cluster_3
pathway NES ES pval padj log2err size leadingEdge
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 1.852556 0.5516434 0.0001120 0.0207161 0.5384341 71 HLA-DQB1/CD40LG/H3C2/H3C10/H3C11/…
KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 1.767137 0.5097968 0.0002292 0.0212001 0.5188481 90 IFNGR2/CD40LG/TNFRSF18/CCL5/TNFRSF4/…
KEGG_LEISHMANIA_INFECTION 1.777612 0.5759608 0.0010579 0.0652342 0.4550599 44 HLA-DQB1/IFNGR2/FOS/NFKBIA/HLA-DQA1/…
KEGG_JAK_STAT_SIGNALING_PATHWAY 1.705611 0.5024723 0.0015053 0.0696205 0.4550599 75 SOCS2/PIM1/IFNGR2/CISH/SOCS3/…
KEGG_ASTHMA 1.753141 0.7114749 0.0026394 0.0976577 0.4317077 16 HLA-DQB1/CD40LG/HLA-DQA1/IL10/HLA-DRA

11.3.3.6.3.4 cluster_4
pathway NES ES pval padj log2err size leadingEdge
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 2.467692 0.7116934 0.0000000 0.0000000 0.8753251 78 FCGR2B/CD86/C1QA/C1QB/FCGR3A/…
KEGG_HEMATOPOIETIC_CELL_LINEAGE 2.084252 0.6643323 0.0000079 0.0007360 0.5933255 44 ITGA4/HLA-DRB1/CD9/CD37/CD4/…
KEGG_ASTHMA 2.031817 0.8275269 0.0000626 0.0029120 0.5384341 15 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-DQB1/HLA-DQA1/…
KEGG_GRAFT_VERSUS_HOST_DISEASE 1.983153 0.7145278 0.0001142 0.0042481 0.5384341 25 CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
KEGG_AUTOIMMUNE_THYROID_DISEASE 1.938784 0.7348842 0.0001764 0.0054671 0.5188481 21 CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION 1.879187 0.5763188 0.0002234 0.0059352 0.5188481 51 HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/HLA-DQB1/…
KEGG_TYPE_I_DIABETES_MELLITUS 1.922818 0.6927890 0.0002886 0.0059826 0.4984931 25 CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
KEGG_CELL_ADHESION_MOLECULES_CAMS 1.839754 0.5574219 0.0002895 0.0059826 0.4984931 57 ITGA4/CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/…
KEGG_ALLOGRAFT_REJECTION 1.904478 0.6861813 0.0003380 0.0062861 0.4984931 25 CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
KEGG_VIRAL_MYOCARDITIS 1.889792 0.6082394 0.0004968 0.0084012 0.4772708 42 CD86/HLA-DRB1/HLA-DMB/HLA-DPA1/HLA-G/…
KEGG_TYPE_II_DIABETES_MELLITUS -1.621156 -0.6346345 0.0069573 0.0862711 0.4070179 19 HK1/PIK3CG/PRKCD/MAPK1/HK3/…
KEGG_RIBOSOME -1.811246 -0.5190738 0.0000417 0.0025854 0.5573322 84 RPL23/RPL18A/RPL29/RPL18/RPL36AL/…

11.3.3.6.3.5 cluster_6
pathway NES ES pval padj log2err size leadingEdge
KEGG_INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION 2.213485 0.7466168 0.0000352 0.0032235 0.5573322 21 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_LEISHMANIA_INFECTION 2.115934 0.6404779 0.0000277 0.0032235 0.5756103 34 HLA-DQA1/IFNGR2/FCGR1A/HLA-DRB1/HLA-DPB1/…
KEGG_CELL_ADHESION_MOLECULES_CAMS 1.958053 0.5846113 0.0003231 0.0197077 0.4984931 36 HLA-DQA1/VCAN/HLA-DRB1/HLA-DPB1/HLA-DPA1/…
KEGG_ASTHMA 2.001552 0.7615573 0.0013252 0.0441455 0.4550599 13 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_ALLOGRAFT_REJECTION 1.993175 0.6867902 0.0018186 0.0441455 0.4550599 19 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_HEMATOPOIETIC_CELL_LINEAGE 1.958328 0.6422632 0.0016543 0.0441455 0.4550599 23 CD14/HLA-DRB1/CD4/CD37/HLA-DRA/…
KEGG_GRAFT_VERSUS_HOST_DISEASE 1.941524 0.6672288 0.0013107 0.0441455 0.4550599 20 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_TYPE_I_DIABETES_MELLITUS 1.901411 0.6534435 0.0019299 0.0441455 0.4550599 20 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_AUTOIMMUNE_THYROID_DISEASE 1.949401 0.6884058 0.0027751 0.0564267 0.4317077 18 HLA-DQA1/HLA-DRB1/HLA-DPB1/HLA-DPA1/HLA-DQB1/…
KEGG_VIBRIO_CHOLERAE_INFECTION -1.620600 -0.6272288 0.0043339 0.0793103 0.4070179 27 KDELR1/ATP6V1B2/KCNQ1/PDIA4/ATP6V0D1/…

11.3.3.7 TFT_GTRD

11.3.3.7.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.7.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.7.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates that the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates that it is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.3.3.7.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
HSD17B8_TARGET_GENES 2.577136 0.6971699 0.0000000 0.0000000 1.6719967 447 GINS2/TK1/RRM2/CENPN/MKI67/…
E2F5_TARGET_GENES 1.594857 0.4167934 0.0000000 0.0000046 0.7337620 874 GINS2/TK1/RRM2/MKI67/MCM2/…
PSMB5_TARGET_GENES 1.840508 0.5245759 0.0000006 0.0000991 0.6594444 229 RRM2/STMN1/CKS2/TUBA1B/H2AX/…
SETD7_TARGET_GENES 1.454402 0.3819497 0.0000877 0.0090553 0.5384341 708 TYMS/RRM2/STMN1/DHFR/MCM7/…
PHF21A_TARGET_GENES 1.694348 0.4944367 0.0002203 0.0189432 0.5188481 181 PTGDS/TK1/RRM2/CDCA7/UHRF1/…
LHX3_TARGET_GENES 1.871275 0.7355436 0.0004250 0.0274101 0.4984931 25 RBBP8/TOP2A/H2BC9/H2AC4/F12/…
AEBP2_TARGET_GENES 1.372416 0.3590654 0.0005077 0.0291097 0.4772708 802 TYMS/CENPM/MCM4/TK1/CENPN/…
POU2AF1_TARGET_GENES 1.381042 0.3645280 0.0010371 0.0486511 0.4550599 620 TYMS/CENPN/CDCA7/IGLV4-69/CDC45/…
ZNF563_TARGET_GENES -1.363622 -0.3213080 0.0016575 0.0712735 0.4550599 353 WDR55/PIH1D1/DGKZ/NUP188/ZNF559/…
SUPT20H_TARGET_GENES -1.207036 -0.2654213 0.0009148 0.0472019 0.4772708 1078 MPV17L2/CCDC191/ZFAS1/TOR1AIP1/DENR/…
DIDO1_TARGET_GENES -1.217403 -0.2632020 0.0004196 0.0274101 0.4984931 1306 ARL1/CCDC191/PMPCB/TOR1AIP1/RPL6/…
HOXA1_TARGET_GENES -1.363433 -0.3021310 0.0000126 0.0016197 0.5933255 864 RANBP3/SLC9A6/ARL1/ACAT1/CCDC191/…

11.3.3.7.3.2 cluster_3
pathway NES ES pval padj log2err size leadingEdge
NKX2_3_TARGET_GENES -1.437627 -0.3235113 0.0002802 0.0481984 0.4984931 358 INTS2/TTC33/ACADSB/M6PR/CSKMT/…
NFRKB_TARGET_GENES -1.250958 -0.2551818 0.0000725 0.0231272 0.5384341 1464 BRD8/MIGA1/RAB7A/AAR2/SRRD/…
ELF2_TARGET_GENES -1.279697 -0.2621921 0.0000896 0.0231272 0.5384341 1169 TSG101/VPS50/RAB7A/RNF115/AAR2/…

11.3.3.7.3.3 cluster_4
pathway NES ES pval padj log2err size leadingEdge
HSD17B8_TARGET_GENES 1.471109 0.3539828 0.0004746 0.0796607 0.4984931 341 BIRC5/MKI67/PTTG1/TROAP/CDKN3/…
DIDO1_TARGET_GENES -1.246076 -0.2675079 0.0010422 0.0796607 0.4550599 1243 WDR77/POP4/MRPL54/SUPT7L/RLF/…
ZNF766_TARGET_GENES -1.284510 -0.2850324 0.0007870 0.0796607 0.4772708 759 NEAT1/LUCAT1/PA2G4/LYN/TMEM115/…
DLX4_TARGET_GENES -1.344985 -0.3022557 0.0005310 0.0796607 0.4772708 615 WDR77/SUPT7L/CSNK1G1/PFKM/SF3B5/…
GLI3_TARGET_GENES -1.345558 -0.3052829 0.0007172 0.0796607 0.4772708 545 SVIP/RLF/R3HDM2/APRT/SMC4/…
RFX7_TARGET_GENES -1.399458 -0.3289083 0.0010828 0.0796607 0.4550599 363 TYK2/WDR77/NEMF/USP48/U2AF2/…
ELF5_TARGET_GENES -1.459133 -0.3441153 0.0004178 0.0796607 0.4984931 341 HNRNPF/SVIP/LYN/UBL7/WDR43/…

11.3.3.8 TFT_TFT_Legacy

11.3.3.8.1 Summaryplot: Covid_Critical

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Critical. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.8.2 Summaryplot: Covid_Mild

Here we plot a summary of the gene sets/pathways that are enriched in genes overexpressed in Covid_Mild. If there are more than 30 significant pathways total, we plot the top 5 per cluster with the lowest adjusted P value.

11.3.3.8.3 Per-cluster pathway results

Here we plot the pathways that are significantly enriched in the differentially expressed genes between Covid_Critical and Covid_Mild for each cluster.

For each cluster, we display a table of the GSEA results. NES refers to the Normalized Enrichment Score, the main effect size for GSEA. A positive NES indicates that the pathway is enriched in Covid_Critical relative to Covid_Mild; a negative NES indicates that it is enriched in Covid_Mild relative to Covid_Critical.

If a cluster is missing, no pathways were significantly enriched for that cluster.

11.3.3.8.3.1 cluster_1
pathway NES ES pval padj log2err size leadingEdge
E2F_Q3 2.224835 0.6517409 0.0000000 0.0000000 0.8753251 165 MCM4/MCM2/STMN1/CDCA7/MCM7/…
E2F1_Q4_01 2.167436 0.6323449 0.0000000 0.0000000 0.8634154 173 MCM4/MCM2/STMN1/CDCA7/MCM7/…
E2F_Q4_01 2.170634 0.6295761 0.0000000 0.0000000 0.8513391 180 MCM4/MCM2/STMN1/CDCA7/MCM7/…
E2F_Q6_01 2.110619 0.6141850 0.0000000 0.0000000 0.8390889 176 MCM4/RRM2/MCM2/STMN1/CDCA7/…
E2F_Q3_01 2.102959 0.6092334 0.0000000 0.0000001 0.8012156 182 MCM4/MCM2/STMN1/CDCA7/MCM7/…
E2F_Q4 2.076081 0.6000867 0.0000000 0.0000001 0.7881868 187 MCM4/RRM2/MCM2/CLSPN/STMN1/…
E2F_03 2.052959 0.5969960 0.0000000 0.0000002 0.7749390 179 MCM4/MCM2/STMN1/CDCA7/MCM7/…
E2F4DP1_01 2.006712 0.5775346 0.0000000 0.0000010 0.7477397 190 MCM4/RRM2/MCM2/CLSPN/STMN1/…
E2F1DP1RB_01 2.057916 0.6109312 0.0000000 0.0000021 0.7195128 145 RRM2/MCM2/CLSPN/STMN1/CDCA7/…
KRCTCNNNNMANAGC_UNKNOWN 2.163679 0.7694695 0.0000008 0.0000312 0.6594444 43 H3C2/UBE2C/H2AC14/H2BC11/H2AC16/…
OCT1_03 -1.484513 -0.4114424 0.0038742 0.0908938 0.4317077 110 TP53BP1/ARL1/RMI1/AAK1/AMD1/…
AAGWWRNYGGC_UNKNOWN -1.592205 -0.4453427 0.0019950 0.0507073 0.4317077 99 ARL1/TMEM187/PCIF1/MYLIP/TRIM8/…

11.3.3.8.3.2 cluster_3
pathway NES ES pval padj log2err size leadingEdge
KRCTCNNNNMANAGC_UNKNOWN 2.00142 0.6711829 5.1e-05 0.0311284 0.5573322 36 H1-4/H3C2/H1-2/H3C10/H3C11/…

12 Run info

12.1 Run time

# Total run time in hours: difference between the current process clock and
# `timestart`, keeping only the third entry (elapsed time in seconds per
# ?proc.time) and converting seconds -> minutes -> hours.
hourspassed <- setNames((proc.time() - timestart)[[3]] / 60 / 60, 'Hours')
hourspassed
##     Hours 
## 0.5844919

12.2 Memory usage

# Report the approximate peak memory footprint of the run.
# Every object in the current environment is removed first, so nothing
# created earlier in the report is still referenced when gc() runs.
rm(list = ls())

# Force a full garbage collection. `TRUE` is spelled out rather than `T`,
# because `T` is an ordinary variable that can be reassigned.
finalmem <- gc(verbose = TRUE, full = TRUE)

# The last column of the gc() matrix is the "max used" size in Mb (see ?gc);
# summing over its rows adds the Ncells and Vcells figures together.
mb <- sum(finalmem[, ncol(finalmem)])
# Mb -> Gb with a factor of 1000, hence "approximately" in the label.
gb <- mb / 1000
gb <- setNames(gb, 'Gb used (approximately)')

gb
## Gb used (approximately) 
##                  3.9599

12.3 Session info

# Optional end-of-run notification (presumably an audible alert), left disabled.
# beepr::beep()

# Print the R version, platform, locale and attached/loaded package versions
# so the environment that produced this report is recorded alongside it.
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux 8.7 (Ootpa)
## 
## Matrix products: default
## BLAS/LAPACK: /gs/gsfs0/home/aferrena/packages/miniconda3/miniconda3/envs/scdapp/lib/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] doParallel_1.0.17     iterators_1.0.14      fgsea_1.28.0         
##  [4] speckle_0.99.7        circlize_0.4.16       Matrix_1.6-5         
##  [7] ggrepel_0.9.5         ggfittext_0.10.2      ggalluvial_0.12.5    
## [10] hdf5r_1.3.9           msigdbr_7.5.1         edgeR_4.0.16         
## [13] limma_3.58.1          ggridges_0.5.6        ggdendro_0.2.0       
## [16] ComplexHeatmap_2.18.0 glmGamPoi_1.14.3      foreach_1.5.2        
## [19] future_1.33.1         DoubletFinder_2.0.4   scDAPP_1.0.0         
## [22] Seurat_5.0.2          SeuratObject_5.0.1    sp_2.1-3             
## [25] RISC_1.6.0            patchwork_1.2.0       lubridate_1.9.3      
## [28] forcats_1.0.0         stringr_1.5.1         dplyr_1.1.4          
## [31] purrr_1.0.2           readr_2.1.5           tidyr_1.3.1          
## [34] tibble_3.2.1          ggplot2_3.5.0         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22            splines_4.3.3              
##   [3] later_1.3.2                 bitops_1.0-7               
##   [5] polyclip_1.10-6             fastDummies_1.7.3          
##   [7] lifecycle_1.0.4             globals_0.16.2             
##   [9] lattice_0.22-5              MASS_7.3-60                
##  [11] magrittr_2.0.3              sass_0.4.8                 
##  [13] rmarkdown_2.25              plotly_4.10.4              
##  [15] jquerylib_0.1.4             yaml_2.3.8                 
##  [17] httpuv_1.6.14               sctransform_0.4.1          
##  [19] spam_2.10-0                 askpass_1.2.0              
##  [21] spatstat.sparse_3.0-3       reticulate_1.35.0          
##  [23] cowplot_1.1.3               pbapply_1.7-2              
##  [25] RColorBrewer_1.1-3          abind_1.4-5                
##  [27] zlibbioc_1.48.0             Rtsne_0.17                 
##  [29] GenomicRanges_1.54.1        BiocGenerics_0.48.1        
##  [31] RCurl_1.98-1.14             GenomeInfoDbData_1.2.11    
##  [33] IRanges_2.36.0              S4Vectors_0.40.2           
##  [35] irlba_2.3.5.1               listenv_0.9.1              
##  [37] spatstat.utils_3.0-4        pheatmap_1.0.12            
##  [39] umap_0.2.10.0               goftest_1.2-3              
##  [41] RSpectra_0.16-1             spatstat.random_3.2-3      
##  [43] fitdistrplus_1.1-11         parallelly_1.37.1          
##  [45] DelayedMatrixStats_1.24.0   leiden_0.4.3.1             
##  [47] codetools_0.2-19            DelayedArray_0.28.0        
##  [49] tidyselect_1.2.0            shape_1.4.6.1              
##  [51] farver_2.1.1                viridis_0.6.5              
##  [53] matrixStats_1.2.0           stats4_4.3.3               
##  [55] spatstat.explore_3.2-6      jsonlite_1.8.8             
##  [57] GetoptLong_1.0.5            ellipsis_0.3.2             
##  [59] progressr_0.14.0            survival_3.5-8             
##  [61] Matrix.utils_0.9.8          tools_4.3.3                
##  [63] ica_1.0-3                   Rcpp_1.0.12                
##  [65] glue_1.7.0                  gridExtra_2.3              
##  [67] SparseArray_1.2.4           mgcv_1.9-1                 
##  [69] xfun_0.42                   MatrixGenerics_1.14.0      
##  [71] GenomeInfoDb_1.38.6         withr_3.0.0                
##  [73] fastmap_1.1.1               fansi_1.0.6                
##  [75] openssl_2.1.1               digest_0.6.34              
##  [77] timechange_0.3.0            R6_2.5.1                   
##  [79] mime_0.12                   colorspace_2.1-0           
##  [81] scattermore_1.2             tensor_1.5                 
##  [83] spatstat.data_3.0-4         utf8_1.2.4                 
##  [85] generics_0.1.3              data.table_1.15.2          
##  [87] FNN_1.1.4                   httr_1.4.7                 
##  [89] htmlwidgets_1.6.4           S4Arrays_1.2.0             
##  [91] uwot_0.1.16                 pkgconfig_2.0.3            
##  [93] gtable_0.3.4                lmtest_0.9-40              
##  [95] SingleCellExperiment_1.24.0 XVector_0.42.0             
##  [97] htmltools_0.5.7             dotCall64_1.1-1            
##  [99] clue_0.3-65                 scales_1.3.0               
## [101] Biobase_2.62.0              png_0.1-8                  
## [103] knitr_1.45                  tzdb_0.4.0                 
## [105] reshape2_1.4.4              rjson_0.2.21               
## [107] nlme_3.1-164                cachem_1.0.8               
## [109] zoo_1.8-12                  GlobalOptions_0.1.2        
## [111] KernSmooth_2.23-22          miniUI_0.1.1.1             
## [113] pillar_1.9.0                vctrs_0.6.5                
## [115] RANN_2.6.1                  promises_1.2.1             
## [117] xtable_1.8-4                cluster_2.1.6              
## [119] evaluate_0.23               locfit_1.5-9.8             
## [121] cli_3.6.2                   compiler_4.3.3             
## [123] rlang_1.1.3                 crayon_1.5.2               
## [125] grr_0.9.5                   future.apply_1.11.1        
## [127] labeling_0.4.3              densityClust_0.3.3         
## [129] plyr_1.8.9                  stringi_1.8.3              
## [131] BiocParallel_1.36.0         viridisLite_0.4.2          
## [133] deldir_2.0-4                babelgene_22.9             
## [135] munsell_0.5.0               lazyeval_0.2.2             
## [137] spatstat.geom_3.2-9         RcppHNSW_0.6.0             
## [139] hms_1.1.3                   sparseMatrixStats_1.14.0   
## [141] bit64_4.0.5                 statmod_1.5.0              
## [143] shiny_1.8.0                 highr_0.10                 
## [145] SummarizedExperiment_1.32.0 ROCR_1.0-11                
## [147] igraph_1.6.0                bslib_0.6.1                
## [149] fastmatch_1.1-4             bit_4.0.5